Ejemplo n.º 1
0
def save_colors_and_ls(fpaths):
    """Fill the shared ``dict_color_and_ls`` cache with a (color, line style) per label.

    Nothing happens when the cache already holds entries.  Otherwise every
    path in ``fpaths`` gets an entry keyed by its label:

    * a path listed in ``qconfig.dict_of_broken_scaffolds`` reuses the color
      already stored for the path it maps to and gets ``secondary_line_style``
      (contigs and scaffolds share a color, scaffolds are drawn dashed);
    * any other path takes the next color from ``colors`` (cycling) and
      ``primary_line_style``.
    """
    if dict_color_and_ls:
        return

    next_color = 0
    for fpath in fpaths:
        label = qutils.label_from_fpath(fpath)
        if fpath and fpath in qconfig.dict_of_broken_scaffolds:
            # Look up the entry of the related file; it must have been
            # registered earlier in this loop, otherwise this raises KeyError.
            related_fpath = qconfig.dict_of_broken_scaffolds[fpath]
            related_label = qutils.label_from_fpath(related_fpath)
            entry = (dict_color_and_ls[related_label][0], secondary_line_style)
        else:
            entry = (colors[next_color % len(colors)], primary_line_style)
            next_color += 1
        dict_color_and_ls[label] = entry
Ejemplo n.º 2
0
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Run a GeneMark-family gene finder on every assembly and store the gene counts.

    The tool is chosen from the flags: ``meta`` selects MetaGeneMark,
    otherwise ``prokaryote`` selects GeneMarkS, otherwise GeneMark-ES is used.
    One ``predict_genes`` job per FASTA file is run through joblib and the
    returned ``(unique_count, count)`` pairs are written to each assembly's
    report; an error is logged for assemblies where both values are None.

    Always returns None.  The function stops early when license limitations
    are active, when the tool directory for this platform does not exist, or
    when ``install_genemark`` reports failure.
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning('  Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return

    # NOTE(review): the install step always targets the 'genemark' directory,
    # even when GeneMark-ES ('genemark-es') was selected -- confirm intended.
    successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # Clamp to at least one job: with no assemblies ``min(...)`` would be 0
    # and the integer division below would raise ZeroDivisionError.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
        index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    # saving results
    for i, (fasta_path, (unique_count, count)) in enumerate(zip(fasta_fpaths, results)):
        report = reporting.get(fasta_path)
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # Only GeneMark-ES on a file under 2,000,000 bytes gets the extra hint.
            too_small_for_es = tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000
            hint = ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                    if too_small_for_es else '')
            logger.error('  ' + qutils.index_to_str(i) +
                         'Failed predicting genes in ' + qutils.label_from_fpath(fasta_path) + '. ' + hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Ejemplo n.º 3
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function):
    """Predict genes for one assembly with the given GeneMark runner.

    ``gmhmm_p_function`` is called with the tool directory, the contigs file,
    a ``<assembly>_genemark.stderr`` path inside ``out_dirpath``, the index
    and the temporary directory.  If it returns nothing, ``(None, None)`` is
    returned.  Otherwise the genes are written to a GFF file (and to FASTA
    when ``OUTPUT_FASTA`` is set) and the function returns
    ``(unique_count, count)`` where ``unique_count`` is the number of distinct
    ``gene[4]`` values and ``count[k]`` is the number of genes with
    ``gene[3] - gene[2] > gene_lengths[k]``.
    """
    def log(message):
        logger.info('  ' + qutils.index_to_str(index) + message)

    assembly_name = qutils.name_from_fpath(contigs_fpath)
    log(qutils.label_from_fpath(contigs_fpath))

    stderr_fpath = os.path.join(out_dirpath, assembly_name + '_genemark.stderr')
    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, stderr_fpath, index, tmp_dirpath)
    if not genes:
        return None, None

    output_stem = assembly_name + '_' + "genemark"
    gff_fpath = os.path.join(out_dirpath, output_stem + '_genes.gff')
    add_genes_to_gff(genes, gff_fpath)
    if OUTPUT_FASTA:
        fasta_fpath = os.path.join(out_dirpath, output_stem + '_genes.fasta')
        add_genes_to_fasta(genes, fasta_fpath)

    # One counter per threshold: genes whose span exceeds that threshold.
    count = []
    for min_length in gene_lengths:
        count.append(sum(1 for gene in genes if gene[3] - gene[2] > min_length))
    unique_count = len(set(gene[4] for gene in genes))

    log('  Genes = ' + str(unique_count) + ' unique, ' + str(len(genes)) + ' total')
    log('  Predicted genes (GFF): ' + gff_fpath)

    return unique_count, count
Ejemplo n.º 4
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath,
                  tool_dirpath, tmp_dirpath):
    """Run ``glimmerHMM`` on one assembly and return ``(unique, cnt)``.

    Output files are named after the assembly: ``<name>_glimmer`` is passed
    as the output prefix and ``<name>_glimmer.stderr`` as the error log, both
    inside ``out_dirpath``.  The gene totals and the GFF path are logged only
    when ``glimmerHMM`` reports a GFF path.
    """
    def log(message):
        logger.info('  ' + qutils.index_to_str(index) + message)

    assembly_name = qutils.name_from_fpath(contigs_fpath)
    log(qutils.label_from_fpath(contigs_fpath))

    output_prefix = os.path.join(out_dirpath, assembly_name + '_glimmer')
    stderr_fpath = os.path.join(out_dirpath, assembly_name + '_glimmer.stderr')

    glimmer_result = glimmerHMM(tool_dirpath, contigs_fpath, output_prefix,
                                gene_lengths, stderr_fpath, tmp_dirpath, index)
    out_gff_path, unique, total, cnt = glimmer_result

    if out_gff_path:
        log('  Genes = ' + str(unique) + ' unique, ' + str(total) + ' total')
        log('  Predicted genes (GFF): ' + out_gff_path)

    return unique, cnt
Ejemplo n.º 5
0
Archivo: gage.py Proyecto: zjwang6/TS
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference,
             tmp_dir):
    """Run the GAGE shell script for one assembly and return its exit code.

    The script is invoked as ``sh <gage_tool_path> <reference> <contigs>
    <tmp_dir> <min_contig>``.  Its stdout and stderr are written to
    ``gage_<assembly>.stdout`` / ``gage_<assembly>.stderr`` inside
    ``gage_results_dirpath``.  "Done." is logged for exit code 0 and
    "Failed." for any other code.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    indent = '  ' + qutils.index_to_str(i)

    logger.info(indent + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath,
                                 'gage_' + assembly_name + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath,
                                 'gage_' + assembly_name + '.stderr')
    logger.info(indent + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # ``with`` guarantees both log files are closed even if the call raises.
    with open(log_out_fpath, 'w') as log_out_f, \
            open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir,
             str(qconfig.min_contig)],
            stdout=log_out_f,
            stderr=log_err_f,
            indent=indent,
            only_if_debug=False)

    # A non-zero exit status means failure (the check used to be inverted).
    if return_code != 0:
        logger.info(indent + 'Failed.')
    else:
        logger.info(indent + 'Done.')

    return return_code
Ejemplo n.º 6
0
def _handle_fasta(contigs_fpath, corr_fpath, reporting):
    """Correct one contigs file and record its per-threshold statistics.

    Returns False (after a warning) when the summed length of contigs that
    are at least ``qconfig.min_contig`` long is zero, or when
    ``correct_fasta`` fails.  Otherwise the report for ``corr_fpath``
    receives, for every value in ``qconfig.contig_thresholds``, the number of
    contigs and the total length of contigs at or above that threshold, and
    True is returned.
    """
    lengths = fastaparser.get_lengths_from_fastafile(contigs_fpath)

    usable_length = sum(length for length in lengths
                        if length >= qconfig.min_contig)
    if not usable_length:
        message = "Skipping %s because it doesn't contain contigs >= %d bp." % (
            qutils.label_from_fpath(corr_fpath), qconfig.min_contig)
        logger.warning(message)
        return False

    # correcting
    corrected = correct_fasta(contigs_fpath, corr_fpath, qconfig.min_contig)
    if not corrected:
        return False

    report = reporting.get(corr_fpath)

    # One entry per configured threshold, in the order of the thresholds.
    contig_counts = []
    total_lengths = []
    for threshold in qconfig.contig_thresholds:
        kept = [length for length in lengths if length >= threshold]
        contig_counts.append(len(kept))
        total_lengths.append(sum(kept))

    report.add_field(reporting.Fields.CONTIGS__FOR_THRESHOLDS, contig_counts)
    report.add_field(reporting.Fields.TOTALLENS__FOR_THRESHOLDS, total_lengths)
    return True
Ejemplo n.º 7
0
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and return its exit code.

    The script is invoked as ``sh <gage_tool_path> <reference> <contigs>
    <tmp_dir> <min_contig>``.  Its stdout and stderr are written to
    ``gage_<assembly>.stdout`` / ``gage_<assembly>.stderr`` inside
    ``gage_results_dirpath``.  "Failed." is logged for a non-zero exit code,
    "Done." otherwise.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    indent = '  ' + qutils.index_to_str(i)

    logger.info(indent + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stderr')
    logger.info(indent + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # ``with`` guarantees both log files are closed even if the call raises.
    with open(log_out_fpath, 'w') as log_out_f, \
            open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
            stdout=log_out_f,
            stderr=log_err_f,
            indent=indent,
            only_if_debug=False)

    if return_code != 0:
        logger.info(indent + 'Failed.')
    else:
        logger.info(indent + 'Done.')

    return return_code
Ejemplo n.º 8
0
def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name, features_in_contigs, ref_features_num):
    """Save per-assembly feature amounts through ``save`` and return its result.

    The target path is ``output_dirpath + prefix_fn + feature_name +
    in_contigs_suffix_fn``.  The saved mapping has three keys:
    ``'filenames'`` (labels of ``contigs_fpaths``, in order),
    ``'<feature_name>_in_contigs'`` (label -> amounts, built from
    ``features_in_contigs``) and ``'ref_<feature_name>_number'``.
    """
    # A real list instead of ``map(...)``: on Python 3 ``map`` is a lazy
    # iterator, which is not a plain sequence for whatever ``save`` does with it.
    labels = [qutils.label_from_fpath(contigs_fpath) for contigs_fpath in contigs_fpaths]
    amounts_by_label = {
        qutils.label_from_fpath(contigs_fpath): feature_amounts
        for contigs_fpath, feature_amounts in features_in_contigs.items()
    }
    return save(output_dirpath + prefix_fn + feature_name + in_contigs_suffix_fn, {
        'filenames': labels,
        feature_name + '_in_contigs': amounts_by_label,
        'ref_' + feature_name + '_number': ref_features_num,
    })
Ejemplo n.º 9
0
def get_color_and_ls(fpath, label=None):
    """Return the ``(color, line style)`` tuple stored for an assembly.

    The lookup key is ``label``; when it is empty or None the label is
    derived from ``fpath`` with ``qutils.label_from_fpath``.  Raises KeyError
    if ``dict_color_and_ls`` has no entry for that label.
    """
    # The description above must be the first statement to be a real
    # docstring; placed after the ``if`` it was only a discarded string.
    if not label:
        label = qutils.label_from_fpath(fpath)
    return dict_color_and_ls[label]
Ejemplo n.º 10
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM contig by contig and summarise the predicted genes.

    Every record of ``fasta_fpath`` is written to its own FASTA file in a
    fresh temporary directory under ``tmp_dir`` and passed to the
    ``glimmerhmm`` executable; the tool's stdout and stderr are appended to
    ``err_path``.  GFF outputs of successful runs are merged into
    ``out_fpath + '_genes.gff'``.

    Returns ``(out_gff_path, unique_count, total, cnt)`` where ``cnt[k]`` is
    the number of genes with ``end - start > gene_lengths[k]``, or
    ``(None, None, None, None)`` when no contig was processed successfully.
    """
    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    def run(contig_path, tmp_path):
        # Append so that the log accumulates the output of every contig.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # ``%`` binds tighter than ``+``, so the label must be formatted into
        # the first string before the optional hint is appended; otherwise
        # the hint (which has no placeholder) raises TypeError.
        debug_hint = ('Run with the --debug option'
                      ' to see the command line.' if not qconfig.debug else '')
        logger.error(
            'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(fasta_fpath) +
            debug_hint)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        # NOTE(review): assumes parse_gff yields coordinates that fit this
        # slice (inclusive end) -- confirm against parse_gff.
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            if end - start > gene_length:
                cnt[idx] += 1

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
Ejemplo n.º 11
0
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM gene prediction on every assembly and store the counts.

    If the ``glimmerhmm`` executable is missing it is built with ``make``
    first (output goes to ``make.log`` / ``make.err`` in the source
    directory); on build failure an error is logged and the function returns.
    One ``predict_genes`` job per assembly is then run through joblib, the
    results are added to each assembly's report, and the temporary directory
    is removed unless ``qconfig.debug`` is set.  Always returns None.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        # ``with`` closes both build logs; they used to be left open.
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as make_out, \
                open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as make_err:
            return_code = qutils.call_subprocess(
                ['make', '-C', tool_src_dirpath],
                stdout=make_out,
                stderr=make_err,
                indent='    ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error(
                "Failed to compile GlimmerHMM (" + tool_src_dirpath +
                ")!\nTry to compile it manually or do not use --gene-finding "
                "option with --eukaryote.\nUse --debug option to see the command lines."
            )
            return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # joblib rejects n_jobs == 0, which is what min(...) gives for an empty
    # list of assemblies; clamp to one job so that case is a no-op.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath,
                               tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for contigs_fpath, (unique, cnt) in zip(contigs_fpaths, results):
        report = reporting.get(contigs_fpath)
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            # ``%`` binds tighter than ``+``: format the label into the first
            # string, then append the hint.  Applying ``%`` to the hint (which
            # has no placeholder) raised TypeError instead of logging.
            debug_hint = ('Run with the --debug option'
                          ' to see the command line.' if not qconfig.debug else '')
            logger.error(
                'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(contigs_fpath) +
                debug_hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Ejemplo n.º 12
0
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath,
                               alignments_fpath_template):
    """Split one assembly's contigs into per-reference FASTA files.

    The alignments file (``alignments_fpath_template % label``) is read line
    by line: the first token is a chromosome name, the remaining tokens are
    contig names.  When the chromosome is known to
    ``contigs_analyzer.ref_labels_by_chromosomes``, every listed contig not
    yet written for that reference is appended to
    ``<label>_to_<ref_name[:40]>.fasta`` in ``corrected_dirpath`` and an
    ``Assembly`` for that file is added to ``assemblies_by_ref[ref_name]``
    (once per assembly name, only for references already present as keys).

    Contigs seen but never aligned are written to
    ``<label>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_asm)``.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    # ref name -> set of contig names already written for that reference
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % assembly_label
    if os.path.exists(alignments_fpath):
        # ``with`` closes the alignments file; it used to be left open.
        with open(alignments_fpath) as alignments_file:
            for line in alignments_file:
                values = line.split()
                if not values:
                    # A blank line has no chromosome token; skip it instead
                    # of failing with IndexError.
                    continue
                if values[0] not in contigs_analyzer.ref_labels_by_chromosomes:
                    continue

                ref_name = contigs_analyzer.ref_labels_by_chromosomes[
                    values[0]]
                # Sets keep the membership tests in the inner loop O(1).
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath,
                    assembly_label + '_to_' + ref_name[:40] + '.fasta')
                already_written = aligned_contigs_for_each_ref.setdefault(
                    ref_name, set())

                # NOTE(review): ``contigs`` is only filled here, so when no
                # line matches a known chromosome the not-aligned file below
                # is written empty; this also assumes ``contigs_seq`` can be
                # iterated more than once -- confirm both are intended.
                for (cont_name, seq) in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names and cont_name not in already_written:
                        # Collecting all aligned contig names in order to
                        # further extract the not-aligned ones
                        aligned_contig_names.add(cont_name)
                        already_written.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath,
                                                [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                if ref_asm.name not in added_ref_asm and ref_name in assemblies_by_ref:
                    assemblies_by_ref[ref_name].append(ref_asm)
                    added_ref_asm.append(ref_asm.name)

    # Extraction of not aligned contigs
    not_aligned_contigs_names = set(contigs) - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath,
                            [(name, contigs[name])
                             for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
Ejemplo n.º 13
0
def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name,
                             features_in_contigs, ref_features_num):
    """Save per-assembly feature amounts through ``save`` and return its result.

    The target path is ``output_dirpath + prefix_fn + feature_name +
    in_contigs_suffix_fn``.  The saved mapping has three keys:
    ``'filenames'`` (labels of ``contigs_fpaths``, in order),
    ``'<feature_name>_in_contigs'`` (label -> amounts, built from
    ``features_in_contigs``) and ``'ref_<feature_name>_number'``.
    """
    # A real list instead of ``map(...)``: on Python 3 ``map`` is a lazy
    # iterator, which is not a plain sequence for whatever ``save`` does with it.
    labels = [
        qutils.label_from_fpath(contigs_fpath)
        for contigs_fpath in contigs_fpaths
    ]
    amounts_by_label = {
        qutils.label_from_fpath(contigs_fpath): feature_amounts
        for contigs_fpath, feature_amounts in features_in_contigs.items()
    }
    return save(
        output_dirpath + prefix_fn + feature_name + in_contigs_suffix_fn, {
            'filenames': labels,
            feature_name + '_in_contigs': amounts_by_label,
            'ref_' + feature_name + '_number': ref_features_num,
        })
Ejemplo n.º 14
0
Archivo: glimmer.py Proyecto: ctb/quast
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM gene prediction on every assembly and store the counts.

    If the ``glimmerhmm`` executable is missing it is built with ``make``
    first (output goes to ``make.log`` / ``make.err`` in the source
    directory); on build failure an error is logged and the function returns.
    One ``predict_genes`` job per assembly is then run through joblib, the
    results are added to each assembly's report, and the temporary directory
    is removed unless ``qconfig.debug`` is set.  Always returns None.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        # ``with`` closes both build logs; they used to be left open.
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as make_out, \
                open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as make_err:
            return_code = qutils.call_subprocess(
                ['make', '-C', tool_src_dirpath],
                stdout=make_out,
                stderr=make_err,
                indent='    ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error("Failed to compile GlimmerHMM (" + tool_src_dirpath +
                         ")!\nTry to compile it manually or do not use --gene-finding "
                         "option with --eukaryote.\nUse --debug option to see the command lines.")
            return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # joblib rejects n_jobs == 0, which is what min(...) gives for an empty
    # list of assemblies; clamp to one job so that case is a no-op.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
        index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for contigs_fpath, (unique, cnt) in zip(contigs_fpaths, results):
        report = reporting.get(contigs_fpath)
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            # ``%`` binds tighter than ``+``: format the label into the first
            # string, then append the hint.  Applying ``%`` to the hint (which
            # has no placeholder) raised TypeError instead of logging.
            debug_hint = ('Run with the --debug option'
                          ' to see the command line.' if not qconfig.debug else '')
            logger.error(
                'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(contigs_fpath) +
                debug_hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Ejemplo n.º 15
0
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):
    """Make the plot colors of the assemblies available to the HTML report.

    With ``meta`` set, the report file (``report_fname`` inside
    ``results_dirpath``) is rewritten with its ``{{ colors }}`` placeholder
    replaced by ``standard_colors``.  Otherwise each assembly's color (the
    first item of its ``dict_colors`` entry) is translated to the
    ``html_colors`` value at the same position as in ``plotter.colors``, the
    list is saved through ``json_saver.save_colors`` and appended to the
    report under the key ``colors``.
    """
    from libs import plotter

    if not meta:
        entries = [dict_colors[qutils.label_from_fpath(contigs_fpath)]
                   for contigs_fpath in contigs_fpaths]
        plot_colors = [entry[0] for entry in entries]
        colors_for_html = []
        for plot_color in plot_colors:
            colors_for_html.append(html_colors[plotter.colors.index(plot_color)])
        json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
        append(results_dirpath, json_fpath, "colors")
        return

    html_fpath = os.path.join(results_dirpath, report_fname)
    with open(html_fpath) as report_in:
        original_html = report_in.read()
    updated_html = re.sub("{{ colors }}", "standard_colors", original_html)
    with open(html_fpath, "w") as report_out:
        report_out.write(updated_html)
Ejemplo n.º 16
0
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Split one assembly's contigs into per-reference FASTA files.

    The alignments file (``alignments_fpath_template % label``) is read line
    by line: the first token is a chromosome name, the remaining tokens are
    contig names.  When the chromosome is known to
    ``contigs_analyzer.ref_labels_by_chromosomes``, every listed contig not
    yet written for that reference is appended to
    ``<label>_to_<ref_name[:40]>.fasta`` in ``corrected_dirpath`` and an
    ``Assembly`` for that file is added to ``assemblies_by_ref[ref_name]``
    (once per assembly name, only for references already present as keys).

    Contigs seen but never aligned are written to
    ``<label>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_asm)``.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    logger.info('  ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    # ref name -> set of contig names already written for that reference
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    alignments_fpath = alignments_fpath_template % assembly_label
    if os.path.exists(alignments_fpath):
        # ``with`` closes the alignments file; it used to be left open.
        with open(alignments_fpath) as alignments_file:
            for line in alignments_file:
                values = line.split()
                if not values:
                    # A blank line has no chromosome token; skip it instead
                    # of failing with IndexError.
                    continue
                if values[0] not in contigs_analyzer.ref_labels_by_chromosomes:
                    continue

                ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                # Sets keep the membership tests in the inner loop O(1).
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, assembly_label + '_to_' + ref_name[:40] + '.fasta')
                already_written = aligned_contigs_for_each_ref.setdefault(ref_name, set())

                # NOTE(review): ``contigs`` is only filled here, so when no
                # line matches a known chromosome the not-aligned file below
                # is written empty; this also assumes ``contigs_seq`` can be
                # iterated more than once -- confirm both are intended.
                for (cont_name, seq) in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names and cont_name not in already_written:
                        # Collecting all aligned contig names in order to
                        # further extract the not-aligned ones
                        aligned_contig_names.add(cont_name)
                        already_written.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                if ref_asm.name not in added_ref_asm and ref_name in assemblies_by_ref:
                    assemblies_by_ref[ref_name].append(ref_asm)
                    added_ref_asm.append(ref_asm.name)

    # Extraction of not aligned contigs
    not_aligned_contigs_names = set(contigs) - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath, [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
Ejemplo n.º 17
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Predict genes for a single assembly with ``glimmerHMM``.

    The assembly name determines the output prefix (``<name>_glimmer``) and
    the error log (``<name>_glimmer.stderr``), both placed in
    ``out_dirpath``.  When ``glimmerHMM`` returns a GFF path, the unique and
    total gene counts and that path are logged.

    Returns the ``(unique, cnt)`` part of the ``glimmerHMM`` result.
    """
    name = qutils.name_from_fpath(contigs_fpath)
    label = qutils.label_from_fpath(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(index) + label)

    glimmer_args = (
        tool_dirpath,
        contigs_fpath,
        os.path.join(out_dirpath, name + '_glimmer'),
        gene_lengths,
        os.path.join(out_dirpath, name + '_glimmer.stderr'),
        tmp_dirpath,
        index,
    )
    out_gff_path, unique, total, cnt = glimmerHMM(*glimmer_args)

    if not out_gff_path:
        return unique, cnt

    summary = '  Genes = ' + str(unique) + ' unique, ' + str(total) + ' total'
    logger.info('  ' + qutils.index_to_str(index) + summary)
    logger.info('  ' + qutils.index_to_str(index) + '  Predicted genes (GFF): ' + out_gff_path)
    return unique, cnt
Ejemplo n.º 18
0
def _handle_fasta(contigs_fpath, corr_fpath, reporting):
    """Validate and correct a contigs file, then record threshold statistics.

    The file is skipped (warning logged, False returned) when the lengths of
    its contigs that reach ``qconfig.min_contig`` sum to zero.  False is also
    returned when ``correct_fasta`` fails.  On success the report of
    ``corr_fpath`` gets two lists aligned with ``qconfig.contig_thresholds``:
    contig counts and summed lengths at or above each threshold; the function
    then returns True.
    """
    lengths = fastaparser.get_lengths_from_fastafile(contigs_fpath)

    total_above_min = 0
    for contig_length in lengths:
        if contig_length >= qconfig.min_contig:
            total_above_min += contig_length

    if not total_above_min:
        logger.warning("Skipping %s because it doesn't contain contigs >= %d bp."
                       % (qutils.label_from_fpath(corr_fpath), qconfig.min_contig))
        return False

    # correcting
    if not correct_fasta(contigs_fpath, corr_fpath, qconfig.min_contig):
        return False

    report = reporting.get(corr_fpath)

    # number of contigs at or above each threshold
    counts_field = reporting.Fields.CONTIGS__FOR_THRESHOLDS
    counts = [len([n for n in lengths if n >= limit])
              for limit in qconfig.contig_thresholds]
    report.add_field(counts_field, counts)

    # total length of contigs at or above each threshold
    totals_field = reporting.Fields.TOTALLENS__FOR_THRESHOLDS
    totals = [sum(n for n in lengths if n >= limit)
              for limit in qconfig.contig_thresholds]
    report.add_field(totals_field, totals)
    return True
Ejemplo n.º 19
0
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):
    """Publish assembly colors for the HTML report.

    ``meta=True``: read the report file (``report_fname`` under
    ``results_dirpath``), substitute ``standard_colors`` for the
    ``{{ colors }}`` placeholder and write the file back.

    ``meta=False``: take the color (first element) of every assembly's
    ``dict_colors`` entry, map it to the ``html_colors`` item with the same
    index that the color has in ``plotter.colors``, store the list with
    ``json_saver.save_colors`` and append it to the report as ``colors``.
    """
    from libs import plotter

    def to_html_color(plot_color):
        return html_colors[plotter.colors.index(plot_color)]

    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            html_text = re.sub('{{ colors }}', 'standard_colors', f_html.read())
        with open(html_fpath, 'w') as f_html:
            f_html.write(html_text)
    else:
        colors_and_ls = []
        for contigs_fpath in contigs_fpaths:
            colors_and_ls.append(dict_colors[qutils.label_from_fpath(contigs_fpath)])
        colors = [pair[0] for pair in colors_and_ls]
        colors_for_html = [to_html_color(color) for color in colors]
        json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
        append(results_dirpath, json_fpath, 'colors')
Ejemplo n.º 20
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic statistics for every assembly, report them and draw plots.

    Steps, in order:

    1. Determine the reference length: summed from ``ref_fpath`` when given
       (its GC content is computed too), else taken from
       ``qconfig.estimated_reference_size`` when set, else left as None.
    2. Read every contigs file, collecting contig lengths and the number of
       'N' characters.
    3. For each assembly compute N50/L50, N75/L75 (and NG50/LG50, NG75/LG75
       when a reference length is known), total length, GC content and N's
       per 100 kbp, log them and add them to the assembly's report.
    4. Save lengths and GC data as JSON (when ``json_output_dir`` is set)
       and for the HTML report (when ``qconfig.html_report`` is set).
    5. Draw the cumulative, GC content, Nx and NGx plots when
       ``qconfig.draw_plots`` is set.

    Returns None.
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)
        # Computed once and used for both the log line and the report field.
        # NOTE(review): raises ZeroDivisionError for an assembly whose total
        # length is 0 -- presumably such files are filtered out earlier.
        ns_per_100_kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        logger.info('    ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' + ' %.2f' % ns_per_100_kbp)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        # ``is not None`` (as in the log line above) so that a GC content of
        # exactly 0.0 is reported as '0.00' rather than dropped as None.
        report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % ns_per_100_kbp))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + '/cumulative_plot', 'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # Copy the list so that appending the reference distribution does not
        # modify ``list_of_GC_distributions`` itself.
        list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx', [reference_length] * len(contigs_fpaths))

    logger.info('Done.')
Ejemplo n.º 21
0
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path,
               tmp_dir, index):
    """Run GlimmerHMM on each contig of one FASTA file and collect the genes.

    Every sequence read from ``fasta_fpath`` is written to its own FASTA file
    inside a fresh temporary directory created under ``tmp_dir`` and handed to
    the ``glimmerhmm`` executable located in ``tool_dir``. GFF files produced
    by the runs that exit with code 0 are merged into
    ``out_fpath + '_genes.gff'``.

    Args:
        tool_dir: directory containing the ``glimmerhmm`` executable and its
            ``trained`` sub-directory.
        fasta_fpath: path of the assembly to analyse.
        out_fpath: prefix used to build the output file names.
        gene_lengths: length thresholds; one counter is kept per threshold.
        err_path: file that receives the tool's stdout and stderr (appended).
        tmp_dir: parent directory for the temporary working directory.
        index: assembly index, used only to build the logging indent.

    Returns:
        ``(out_gff_path, n_unique, total, cnt)`` where ``n_unique`` is the
        number of distinct gene sequences, ``total`` the number of records
        yielded by ``parse_gff`` and ``cnt[i]`` the number of genes with
        ``end - start > gene_lengths[i]``. If no contig was processed
        successfully, ``(None, None, None, None)`` is returned.
    """
    def run(contig_path, tmp_path):
        # The tool's output goes to the shared error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o',
                 tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent='  ' + qutils.index_to_str(index) + '  ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        # Only contigs the tool handled successfully take part in the merge.
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Format the label into the first sentence *before* appending the
        # hint: '%' binds tighter than '+', so formatting the concatenation
        # without parentheses would apply '%' to the hint string only.
        if qconfig.debug:
            hint = ''
        else:
            hint = 'Run with the --debug option to see the command line.'
        logger.error(
            'Glimmer failed running Glimmer for %s. ' %
            qutils.label_from_fpath(fasta_fpath) + hint)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        if strand == '+':
            gene_seq = contigs[contig][start:end + 1]
        else:
            gene_seq = rev_comp(contigs[contig][start:end + 1])
        unique.add(gene_seq)
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            # A bool is added as 0/1: count genes longer than the threshold.
            cnt[idx] += end - start > gene_length

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)
    if not qconfig.debug:
        # Per-contig intermediates are kept only for debugging.
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
Ejemplo n.º 22
0
Archivo: gage.py Proyecto: zjwang6/TS
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE on every assembly and store its metrics in the reports.

    ``run_gage`` is executed for each path in ``contigs_fpaths`` (in parallel
    through joblib, using at most ``qconfig.max_threads`` jobs). Afterwards
    the ``gage_<assembly name>.stdout`` log of each assembly is scanned and
    the values found are added to that assembly's report, then
    ``reporting.save_gage`` is called.

    Args:
        ref_fpath: path of the reference, forwarded to ``run_gage``.
        contigs_fpaths: paths of the assemblies to evaluate.
        output_dirpath: directory in which the ``gage`` sub-directory with
            results and a temporary ``tmp`` directory are created.

    Returns:
        None. If none of the jobs returned 0, a warning is logged and the
        function returns before any log is parsed.
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage',
                                  'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.info('Running GAGE...')

    # Metric names in the order they are searched for in the GAGE log.
    # Several names occur twice, so the log is matched sequentially and
    # ``metrics_in_reporting`` gives the report field for each position.
    metrics = [
        'Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size',
        'Chaff bases', 'Missing Reference Bases', 'Missing Assembly Bases',
        'Missing Assembly Contigs', 'Duplicated Reference Bases',
        'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
        'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation',
        'Translocation', 'Total units', 'BasesInFasta', 'Min', 'Max', 'N50'
    ]
    metrics_in_reporting = [
        reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
        reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
        reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
        reporting.Fields.GAGE_CHAFFBASES,
        reporting.Fields.GAGE_MISSINGREFBASES,
        reporting.Fields.GAGE_MISSINGASMBLYBASES,
        reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
        reporting.Fields.GAGE_DUPREFBASES,
        reporting.Fields.GAGE_COMPRESSEDREFBASES,
        reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
        reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
        reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
        reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
        reporting.Fields.GAGE_NUMCORCONTIGS,
        reporting.Fields.GAGE_CORASMBLYSIZE,
        reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
        reporting.Fields.GAGE_CORN50
    ]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath,
                          gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single job returned 0.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                       os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for contigs_fpath in contigs_fpaths:
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath,
                                     'gage_' + assembly_name + '.stdout')
        # The context manager closes the log even if parsing a line raises.
        with open(log_out_fpath, 'r') as logfile_out:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # N50 lines are split on 'N50:' rather than on the
                    # first ':' of the line.
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
Ejemplo n.º 23
0
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length,
                output_dir_path, cov_fpath, ref_fpath, genome_size):
    """Write the JS data files and HTML pages of the contig alignment browser.

    For every entry of the computed ``chr_full_names`` list a
    ``data_<name>.js`` file and a ``_<name>.html`` page (name truncated to 30
    characters) are written into
    ``output_dir_path/<alignment_plots_dirname>``. A summary page,
    ``alignment_summary.html``, is written into ``output_dir_path`` and the
    static CSS/JS assets are copied next to the per-chromosome pages.

    Args:
        assemblies: object whose ``assemblies`` attribute is iterable; each
            item exposes an ``alignments`` list.
        contigs_fpaths: paths of the assemblies; one placeholder ('FICTIVE')
            alignment per path is created for each chromosome.
        chr_names: names of the reference sequences.
        chromosomes_length: mapping from reference sequence name to length.
        output_dir_path: directory that receives the summary page and the
            plots sub-directory.
        cov_fpath: optional coverage file. Each line is split on whitespace;
            field 0 is a sequence name, field 2 an integer that is summed,
            and field 1 is recorded when field 2 is '0'.
            NOTE(review): presumably name / position / depth - confirm.
        ref_fpath: not used by this function.
        genome_size: compared with ``MAX_SIZE_FOR_COMB_PLOT`` to decide
            whether all sequences are shown on a single combined plot.

    Returns:
        None.

    NOTE: many templates below are filled with ``.format(**locals())``, so
    the names of the local variables are part of the output logic - do not
    rename them without updating the templates.
    """
    # Seed every chromosome with one placeholder alignment per assembly.
    chr_to_aligned_blocks = dict()
    for chr in chr_names:
        chr_init = []
        for fpath in contigs_fpaths:
            f = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            f.label = qutils.label_from_fpath(fpath)
            f.unshifted_start = 0
            f.unshifted_end = 0
            chr_init.append(f)
        chr_to_aligned_blocks.setdefault(chr, chr_init)
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path,
                                             alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)
    import contigs_analyzer
    # Decide which plots to produce: one per reference label, a single
    # combined plot, or one per reference sequence.
    if contigs_analyzer.ref_labels_by_chromosomes:
        contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes
        chr_full_names = list(
            set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(
            chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    if cov_fpath:
        cov_data = dict()
        not_covered = dict()
        cur_len = dict()
        with open(cov_fpath, 'r') as coverage:
            name = chr_names[0]
            # Map every reference sequence to the plot it belongs to.
            contig_to_chr = {}
            for chr in chr_full_names:
                cov_data.setdefault(chr, [])
                not_covered.setdefault(chr, [])
                cur_len.setdefault(chr, 0)
                if contigs_analyzer.ref_labels_by_chromosomes:
                    contigs = [
                        contig for contig in chr_names
                        if contig_names_by_refs[contig] == chr
                    ]
                elif chr == NAME_FOR_ONE_PLOT:
                    contigs = chr_names
                else:
                    contigs = [chr]
                for contig in contigs:
                    contig_to_chr[contig] = chr
            for index, line in enumerate(coverage):
                c = list(line.split())
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                # Every 100th line the accumulated sum is stored divided by
                # 100 and the accumulator is reset.
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name] / 100)
                    cur_len[name] = 0
                if c[2] == '0':
                    not_covered[name].append(c[1])
    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    for i, chr in enumerate(chr_full_names):
        short_chr = chr[:30]
        num_misassemblies[chr] = 0
        aligned_bases_by_chr[chr] = []
        aligned_assemblies[chr] = []
        with open(
                os.path.join(output_all_files_dir_path,
                             'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contigs_analyzer.ref_labels_by_chromosomes:
                contigs = [
                    contig for contig in chr_names
                    if contig_names_by_refs[contig] == chr
                ]
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []
            elif chr == NAME_FOR_ONE_PLOT:
                contigs = chr_names
            else:
                contigs = [chr]
            chr_size = sum([chromosomes_length[contig] for contig in contigs])
            chr_sizes[chr] = chr_size
            num_contigs[chr] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[chr].extend(aligned_bases[contig])
            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                l = chromosomes_length[contig]
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(
                    **locals())
            result.write(data_str)

            # adding assembly data
            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(**locals())
            # prev_len is the offset of the current sequence inside the plot:
            # the total length of the sequences placed before it.
            prev_len = 0
            chr_lengths = [0] + [
                chromosomes_length[contig] for contig in contigs
            ]
            for num_contig, contig in enumerate(contigs):
                if num_contig > 0:
                    prev_len += chr_lengths[num_contig]
                if len(chr_to_aligned_blocks[contig]) > 0:
                    for alignment in chr_to_aligned_blocks[contig]:
                        if alignment.misassembled:
                            num_misassemblies[chr] += 1
                        corr_start = prev_len + alignment.unshifted_start
                        corr_end = prev_len + alignment.unshifted_end
                        data_str += '{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},' \
                                    'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, assembly: "{alignment.label}", similar: "{alignment.similar}", misassembled: "{alignment.misassembled}" '.format(**locals())
                        if alignment.name != 'FICTIVE':
                            if len(aligned_assemblies[chr]) < len(
                                    contigs_fpaths
                            ) and alignment.label not in aligned_assemblies[
                                    chr]:
                                aligned_assemblies[chr].append(alignment.label)
                            data_str += ', structure: ['
                            for el in alignment.misassembled_structure:
                                if type(el) == list:
                                    if el[5] in contigs:
                                        num_chr = contigs.index(el[5])
                                        corr_len = sum(chr_lengths[:num_chr +
                                                                   1])
                                    else:
                                        # The piece lies on a sequence shown
                                        # on another plot.
                                        corr_len = -int(el[1])
                                        if contigs_analyzer.ref_labels_by_chromosomes and el[
                                                5] not in used_chromosomes:
                                            used_chromosomes.append(el[5])
                                            new_chr = contig_names_by_refs[
                                                el[5]]
                                            links_to_chromosomes.append(
                                                'links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'
                                                .format(**locals()))
                                    corr_start = corr_len + int(el[0])
                                    corr_end = corr_len + int(el[1])
                                    data_str += '{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}},'.format(
                                        **locals())
                                elif type(el) == str:
                                    data_str += '{{type: "M", mstype: "{el}"}},'.format(
                                        **locals())
                            # Close the structure list, dropping the trailing
                            # comma when at least one element was written.
                            if data_str[-1] == '[':
                                data_str = data_str + ']},'
                            else:
                                data_str = data_str[:-1] + ']},'
                        else:
                            data_str += '},'
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)
            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write(''.join(links_to_chromosomes))
            if cov_fpath:
                # adding coverage data
                data_str = 'var coverage_data = {};\n'
                if cov_data[chr]:
                    data_str += 'coverage_data["{chr}"] = [ '.format(
                        **locals())
                    for e in cov_data[chr]:
                        data_str += '{e},'.format(**locals())
                        # Flush in chunks; the last value stays buffered so
                        # its trailing comma can be stripped below.
                        if len(data_str) > 10000 and e != cov_data[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1] + '];\n'
                    result.write(data_str)
                    data_str = ''

                data_str = 'var not_covered = {};\n'
                data_str += 'not_covered["{chr}"] = [ '.format(**locals())
                if len(not_covered[chr]) > 0:
                    for e in not_covered[chr]:
                        data_str += '{e},'.format(**locals())
                        # Compare against the last element of *this* list so
                        # the final value is never flushed with its comma.
                        if len(data_str) > 10000 and e != not_covered[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1]
                data_str += '];\n'
                result.write(data_str)
                data_str = ''

            # Build the per-chromosome page from the template.
            with open(html_saver.get_real_path('_chr_templ.html'),
                      'r') as template:
                with open(
                        os.path.join(output_all_files_dir_path,
                                     '_{short_chr}.html'.format(**locals())),
                        'w') as result:
                    for line in template:
                        if line.find(
                                '<script type="text/javascript" src=""></script>'
                        ) != -1:
                            result.write(
                                '<script type="text/javascript" src="data_{short_chr}.js"></script>\n'
                                .format(**locals()))
                        else:
                            result.write(line)
                            if line.find('<body>') != -1:
                                chr_size = chr_sizes[chr]
                                chr_name = chr.replace('_', ' ')
                                if len(chr_name) > 50:
                                    chr_name = chr_name[:50] + '...'
                                title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + (
                                    '%s fragments, ' % num_contigs[chr]
                                    if num_contigs[chr] > 1 else ''
                                ) + '%s bp)' % format_long_numbers(chr_size)
                                result.write(
                                    '<div class = "block title"><a href="../{summary_fname}"><button class="back_button">&crarr;</button></a>{title}</div>\n'
                                    .format(**locals()))
                            if line.find(
                                    '<script type="text/javascript">') != -1:
                                chromosome = '","'.join(contigs)
                                result.write(
                                    'var CHROMOSOME = "{chr}";\n'.format(
                                        **locals()))
                                result.write(
                                    'var chrContigs = ["{chromosome}"];\n'.
                                    format(**locals()))

    # Build the summary page listing every plot.
    with open(html_saver.get_real_path('alignment_summary_templ.html'),
              'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [
                len(aligned_assemblies[chr]) for chr in chr_full_names
            ]
            # True when the plots differ in the number of aligned assemblies;
            # a per-row "# assemblies" column is shown in that case.
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if line.find('<!--- assemblies: ---->') != -1:
                    if not is_unaligned_asm_exists:
                        result.write(
                            '<div class="subtitle"># assemblies: %s</div>' %
                            len(contigs_fpaths))
                if line.find('<!--- th_assemblies: ---->') != -1:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if line.find('<!--- references: ---->') != -1:
                    for chr in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = chr[:30]
                        chr_link = os.path.join(
                            alignment_plots_dirname,
                            '_{short_chr}.html'.format(**locals()))
                        chr_name = chr.replace('_', ' ')
                        aligned_lengths = [
                            aligned_len
                            for aligned_len in aligned_bases_by_chr[chr]
                            if aligned_len is not None
                        ]
                        chr_genome = sum(aligned_lengths) * 100.0 / (
                            chr_sizes[chr] * len(contigs_fpaths))
                        chr_size = chr_sizes[chr]
                        result.write('<td><a href="%s">%s</a></td>' %
                                     (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[chr])
                        result.write('<td>%s</td>' %
                                     format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' %
                                         len(aligned_assemblies[chr]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[chr])
                        result.write('</tr>')

    copyfile(
        html_saver.get_real_path(
            os.path.join('static', 'contig_alignment_plot.css')),
        os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')),
             os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(
        html_saver.get_real_path(
            os.path.join('static', 'scripts',
                         'contig_alignment_plot_script.js')),
        os.path.join(output_all_files_dir_path,
                     'contig_alignment_plot_script.js'))
Ejemplo n.º 24
0
def main(args):
    if ' ' in qconfig.QUAST_HOME:
        logger.error(
            'QUAST does not support spaces in paths. \n'
            'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
            'Please, put QUAST in a different directory, then try again.\n',
            to_stderr=True,
            exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args,
                                                    qconfig.short_options,
                                                    qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    for opt, arg in options[:]:

        if opt == '--test' or opt == '--test-sv':
            options.remove((opt, arg))
            options += [
                ('-o', 'quast_test_output'),
                ('-R',
                 os.path.join(qconfig.QUAST_HOME, 'test_data',
                              'reference.fasta.gz')),  # for compiling MUMmer
                ('-O',
                 os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                ('-G',
                 os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                ('--gage', ''),  # for compiling GAGE Java classes
                ('--gene-finding', ''),
                ('--eukaryote', ''),
                ('--glimmer', '')
            ]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1',
                             os.path.join(qconfig.QUAST_HOME, 'test_data',
                                          'reads1.fastq.gz')),
                            ('-2',
                             os.path.join(qconfig.QUAST_HOME, 'test_data',
                                          'reads2.fastq.gz'))]
            contigs_fpaths += [
                os.path.join(qconfig.QUAST_HOME, 'test_data',
                             'contigs_1.fasta'),
                os.path.join(qconfig.QUAST_HOME, 'test_data',
                             'contigs_2.fasta')
            ]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) +
                             ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error(
                    "--extensive-mis-size should be greater than maximum indel length (%d)!"
                    % qconfig.MAX_INDEL_LENGTH,
                    1,
                    to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-plots, --no-snps
            #qconfig.no_check = True  # too risky to include
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error(
                    'Format "%s" is not supported. Please, use one of the supported formats: %s.'
                    % (arg, ', '.join(qconfig.supported_plot_extensions)),
                    to_stderr=True,
                    exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' %
                         (opt + ' ' + arg),
                         to_stderr=True,
                         exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice(
            "Output directory already exists. Existing Nucmer alignments can be used."
        )
        qutils.remove_reports(output_dirpath)

    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int,
                                        qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..',
                                         qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = _correct_contigs(
        contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME,
                         qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(ref_fpath,
                                      contigs_fpaths,
                                      reads_fpaths,
                                      None,
                                      os.path.join(output_dirpath,
                                                   qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold.",
            fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths
    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning(
                "GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths,
                   os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, bed_fpath)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[
                    contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(
                    aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(
            output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                         json_output_dirpath, aligned_lengths_lists,
                         os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                           json_output_dirpath, genes_fpaths, operons_fpaths,
                           detailed_contigs_reports_dirpath,
                           os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                       os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths,
                        os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.meta)

    else:
        logger.main_info("")
        logger.notice(
            "Genes are not predicted by default. Use --gene-finding option to enable it."
        )
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(
        output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info(
            'This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(
                    detailed_contigs_reports_dirpath,
                    'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            number_of_steps = sum([
                int(bool(value))
                for value in [contig_report_fpath_pattern, all_pdf_file]
            ])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(
                    '  1 of %d: Creating contig alignment plot...' %
                    number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths,
                    contig_report_fpath_pattern,
                    output_dirpath,
                    ref_fpath,
                    similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info(
                    '  %d of %d: Creating PDF with all tables and plots...' %
                    (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' +
                     reports_fpaths)
    logger.main_info(
        '  Text versions of transposed total report are saved to ' +
        transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig,
                                     ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths,
                               plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig,
                                     ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) saved to ' +
                         all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot: %s' %
                         contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
Ejemplo n.º 25
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE on every assembly and copy its metrics into the reports.

    ref_fpath       -- path to the reference passed through to run_gage.
    contigs_fpaths  -- paths of the assemblies to evaluate.
    output_dirpath  -- directory under which a 'gage' subdirectory is created
                       for the tool output and a temporary 'tmp' directory.

    Returns None. If none of the GAGE jobs returned 0, a warning is logged
    and the function returns before any metrics are parsed.
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.info('Running GAGE...')

    # Metric names as they are searched for in the GAGE stdout, in the order
    # they are expected to appear. Some names occur twice ('Total units',
    # 'Min', 'Max', 'N50'): the second occurrence is mapped to the
    # "corrected" fields in metrics_in_reporting below.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs', 'Indels < 5bp',
               'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    # Report fields, index-aligned with `metrics`.
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG, reporting.Fields.GAGE_MAXCONTIG,
                            reporting.Fields.GAGE_N50, reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES, reporting.Fields.GAGE_MISSINGASMBLYBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS, reporting.Fields.GAGE_DUPREFBASES,
                            reporting.Fields.GAGE_COMPRESSEDREFBASES, reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS,
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG,
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # One job per assembly, capped by the configured thread limit.
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single job returned 0.
    # NOTE(review): partially failed runs continue to the parsing step below;
    # confirm that run_gage always leaves a stdout log for every assembly.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for contigs_fpath in contigs_fpaths:
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        try:
            # The log is scanned once; cur_metric_id advances each time the
            # currently expected metric name is found in a line.
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # N50 lines are split on 'N50:' instead of the first
                    # colon (presumably the line holds other colons -- verify
                    # against real GAGE output).
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        finally:
            # Close the log even if a malformed line makes parsing raise.
            logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
Ejemplo n.º 26
0
def main(args):
    """Run a complete QUAST analysis for the given command-line arguments.

    args -- list of command-line argument strings, without the program name
            (it is handed to getopt.gnu_getopt as is).

    The function parses the options into qconfig, prepares the output
    directory, corrects the reference and the assemblies, runs the enabled
    analysis stages and finally writes the text/JSON/HTML/PDF reports.

    Returns 0 on success and 4 when none of the assembly files yields usable
    contigs. Usage problems terminate the process through sys.exit (codes 0
    and 2); logger.error is also called with exit_with_code=2/3, which
    presumably exits as well.
    """
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    # Reset the configuration module to its defaults before parsing options.
    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass: options that must take effect before anything else.
    # Iterate over a copy because `options` is modified inside the loop.
    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),   # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding',''), ('--eukaryote','')] # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Second pass: every remaining option is stored in qconfig or a local.
    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are silently ignored.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; the literal "None"
    # stands for an empty list.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except Exception:
            # Best effort: fall back to the default when the CPU count
            # cannot be determined (module missing or not implemented).
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) + ' (use --threads option to set it manually)')


    ########################################################################
    from libs import reporting
    reload(reporting)

    # Start from an empty directory for the corrected input files.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # The combined PDF is optional: continue without it when
            # matplotlib's PDF backend cannot be used.
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote, os.path.join(output_dirpath, 'contigs_reports'))
        # Keep only the assemblies whose alignment finished with status OK.
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding:
        if qconfig.prokaryote or qconfig.meta:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.meta)
        else:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
    else:
        logger.info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            # Count how many of the two optional steps will actually run.
            number_of_steps = sum([int(bool(value)) for value in [detailed_contigs_reports_dirpath, all_pdf_file]])
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info('  1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            # The PDF exists only if PdfPages could be created above; removing
            # a missing file would raise OSError and abort the whole run.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info('  PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info('  Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
Ejemplo n.º 27
0
def get(assembly_fpath):
    """Return the Report stored for assembly_fpath, creating it on first use.

    The path is also appended to the module-level assembly_fpaths list the
    first time it is seen. A new Report is labelled with
    qutils.label_from_fpath(assembly_fpath).
    """
    if assembly_fpath not in assembly_fpaths:
        assembly_fpaths.append(assembly_fpath)
    # Build the Report only when it is missing: dict.setdefault would
    # evaluate its default argument, and so construct and discard a fresh
    # Report, on every single lookup.
    if assembly_fpath not in reports:
        reports[assembly_fpath] = Report(qutils.label_from_fpath(assembly_fpath))
    return reports[assembly_fpath]
Ejemplo n.º 28
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute the aligned-length statistics (NA50, LA50, NGA50, ...) per assembly.

    ref_fpath              -- reference FASTA; its total length is the base
                              for the NGA/LGA values.
    aligned_contigs_fpaths -- assemblies that were aligned successfully.
    output_dirpath         -- main output directory, passed to the HTML saver
                              and to plotter.Nx_plot.
    json_output_dirpath    -- JSON output directory; falsy disables JSON saving.
    aligned_lengths_lists  -- one list of aligned lengths per assembly, in
                              the same order as aligned_contigs_fpaths.
    aligned_stats_dirpath  -- directory for the plots; created if missing.

    Returns a dict holding an empty list under 'header' and under the name of
    every assembly.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = [
        sum(fastaparser.get_lengths_from_fastafile(fpath))
        for fpath in aligned_contigs_fpaths
    ]

    import N50
    # Reference-based values are skipped for a combined reference.
    use_reference = not qconfig.is_combined_ref

    per_assembly = zip(aligned_contigs_fpaths, aligned_lengths_lists,
                       assembly_lengths)
    for idx, (fpath, aligned_lens, total_len) in enumerate(per_assembly):
        # NA/LA: relative to the assembly's own total length.
        na50 = N50.NG50(aligned_lens, total_len)
        na75 = N50.NG50(aligned_lens, total_len, 75)
        la50 = N50.LG50(aligned_lens, total_len)
        la75 = N50.LG50(aligned_lens, total_len, 75)
        if use_reference:
            # NGA/LGA: relative to the reference length.
            nga50 = N50.NG50(aligned_lens, reference_length)
            nga75 = N50.NG50(aligned_lens, reference_length, 75)
            lga50 = N50.LG50(aligned_lens, reference_length)
            lga75 = N50.LG50(aligned_lens, reference_length, 75)

        # Assemble the per-assembly log line piece by piece; the NGA50 and
        # LGA50 parts appear only when they are available and non-zero.
        pieces = [
            '  ',
            qutils.index_to_str(idx),
            qutils.label_from_fpath(fpath),
            ', Largest alignment = ',
            str(max(aligned_lens)),
            ', NA50 = ',
            str(na50),
        ]
        if use_reference and nga50:
            pieces.append(', NGA50 = ' + str(nga50))
        pieces.append(', LA50 = ')
        pieces.append(str(la50))
        if use_reference and lga50:
            pieces.append(', LGA50 = ' + str(lga50))
        logger.info(''.join(pieces))

        report = reporting.get(fpath)
        fields = reporting.Fields
        report.add_field(fields.LARGALIGN, max(aligned_lens))
        report.add_field(fields.NA50, na50)
        report.add_field(fields.NA75, na75)
        report.add_field(fields.LA50, la50)
        report.add_field(fields.LA75, la75)
        if use_reference:
            report.add_field(fields.NGA50, nga50)
            report.add_field(fields.NGA75, nga75)
            report.add_field(fields.LGA50, lga50)
            report.add_field(fields.LGA75, lga75)

    ########################################################################
    # Size of the longest aligned-lengths list; compared with
    # qconfig.max_points when calling Nx_plot below.
    num_contigs = max(len(lengths) for lengths in aligned_lengths_lists)
    too_many_points = num_contigs > qconfig.max_points

    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_assembly_lengths(
            json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_assembly_lengths(
            output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Cumulative plot over the aligned contigs.
        cumulative_plot_fpath = os.path.join(aligned_stats_dirpath,
                                             'cumulative_plot')
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths,
                                aligned_lengths_lists, cumulative_plot_fpath,
                                'Cumulative length (aligned contigs)')

    # NAx and NGAx plots. These calls are made even when draw_plots is off.
    # NOTE(review): presumably Nx_plot also stores the plot data for the
    # JSON/HTML reports -- confirm in plotter.
    plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths,
                    aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths,
                    json_output_dir=json_output_dirpath)
    if use_reference:
        reference_lengths = [reference_length] * len(aligned_contigs_fpaths)
        plotter.Nx_plot(output_dirpath, too_many_points,
                        aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        reference_lengths,
                        json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
Ejemplo n.º 29
0
def main(args):
    """Run a QUAST evaluation described by command-line arguments.

    The function parses ``args`` with ``getopt.gnu_getopt``, stores the
    selected settings on the ``qconfig`` module, sets up the output directory
    and the log file, passes the reference and the contigs files through
    ``_correct_reference`` / ``_correct_contigs``, and then calls the analysis
    modules and the report writers one after another.

    Args:
        args: list of command-line arguments (options and contigs file
            paths) without the program name.

    Returns:
        0 when the run finishes, 4 when ``_correct_contigs`` leaves no
        contigs file to analyse.  Usage problems end the process via
        ``sys.exit`` (code 0 for help/no arguments, 2 for bad usage) or are
        reported through ``logger.error`` with an ``exit_with_code`` value.
    """
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    # Re-execute the qconfig module; presumably this drops settings left over
    # from an earlier main() call in the same process.
    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass over a copy of the option list: handle the options that must
    # take effect before everything else and remove them from `options`, so
    # the second pass below does not report them as unknown.
    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            # Replace --test with a fixed set of options and input files.
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),   # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding',''), ('--eukaryote','')] # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Second pass: translate every remaining option into a qconfig setting or
    # a local variable.
    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            # Kept as the raw string here; converted to a list of ints below.
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            # Values below 1 are clamped to a single thread.
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            # Kept as the raw string here; converted to a list of ints below.
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Any value other than the three listed ones is ignored silently.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds are comma-separated strings at this point; the literal
    # string "None" stands for an empty list.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except Exception:
            # Best effort: fall back to the default on any failure, but let
            # KeyboardInterrupt and SystemExit propagate.
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) + ' (use --threads option to set it manually)')


    ########################################################################
    from libs import reporting
    reload(reporting)

    # Start from an empty directory for the corrected input files.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # Best effort: continue without the combined PDF, but do not
            # swallow KeyboardInterrupt or SystemExit.
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None

    # NOTE(review): nothing in this function fills aligned_contigs_fpaths, so
    # the branch below cannot run as the code is written here.
    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats'))


    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            # One step per drawing task that is actually going to run.
            number_of_steps = sum([int(bool(value)) for value in [detailed_contigs_reports_dirpath, all_pdf_file]])
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info('  1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            # The PDF may not exist (e.g. PdfPages could not be created);
            # removing a missing file would raise OSError and abort the run
            # that the user only wanted to shorten.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info('  PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info('  Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
Ejemplo n.º 30
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute and report the aligned-length statistics of each assembly.

    For every assembly the values NA50/NA75 and LA50/LA75 are obtained from
    ``N50.NG50`` / ``N50.LG50`` against the total assembly length, and
    NGA50/NGA75 and LGA50/LGA75 against the total reference length.  The
    values, together with the largest alignment, are stored in the
    assembly's report; the lengths are optionally saved as JSON and HTML
    data, and the cumulative, NAx and NGAx plots are optionally drawn.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths to the assemblies to process.
        output_dirpath: directory passed to the HTML saver.
        json_output_dirpath: directory passed to the JSON saver; a false
            value disables JSON output.
        aligned_lengths_lists: one list of aligned lengths per assembly, in
            the same order as ``aligned_contigs_fpaths``.
        aligned_stats_dirpath: directory for the plots; created if missing.

    Returns:
        A dict with an empty list under ``'header'`` and under the name of
        every assembly (as given by ``qutils.name_from_fpath``).
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    # Skeleton of the returned dict: one empty list per assembly name.
    report_dict = dict(
        [('header', [])] +
        [(qutils.name_from_fpath(fpath), []) for fpath in aligned_contigs_fpaths])

    ########################################################################
    logger.print_timestamp()
    logger.info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = [
        sum(fastaparser.get_lengths_from_fastafile(fpath))
        for fpath in aligned_contigs_fpaths
    ]

    import N50
    per_assembly = zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for idx, (fpath, aligned_lens, total_len) in enumerate(per_assembly):
        # "A" variants are relative to the assembly length, "GA" variants to
        # the reference length; the 75 variants pass the percentage explicitly.
        na50 = N50.NG50(aligned_lens, total_len)
        nga50 = N50.NG50(aligned_lens, reference_length)
        na75 = N50.NG50(aligned_lens, total_len, 75)
        nga75 = N50.NG50(aligned_lens, reference_length, 75)
        la50 = N50.LG50(aligned_lens, total_len)
        lga50 = N50.LG50(aligned_lens, reference_length)
        la75 = N50.LG50(aligned_lens, total_len, 75)
        lga75 = N50.LG50(aligned_lens, reference_length, 75)
        largest_alignment = max(aligned_lens)

        message_parts = [
            '  ',
            qutils.index_to_str(idx),
            qutils.label_from_fpath(fpath),
            ', Largest alignment = ', str(largest_alignment),
            ', NA50 = ', str(na50),
            ', NGA50 = ', str(nga50),
            ', LA50 = ', str(la50),
            ', LGA50 = ', str(lga50),
        ]
        logger.info(''.join(message_parts))

        report = reporting.get(fpath)
        fields_and_values = (
            (reporting.Fields.LARGALIGN, largest_alignment),
            (reporting.Fields.NA50, na50),
            (reporting.Fields.NGA50, nga50),
            (reporting.Fields.NA75, na75),
            (reporting.Fields.NGA75, nga75),
            (reporting.Fields.LA50, la50),
            (reporting.Fields.LGA50, lga50),
            (reporting.Fields.LA75, la75),
            (reporting.Fields.LGA75, lga75),
        )
        for field, value in fields_and_values:
            report.add_field(field, value)

    ########################################################################
    # JSON data
    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_aligned_contigs_lengths(
            json_output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        json_saver.save_assembly_lengths(
            json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # HTML data
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_aligned_contigs_lengths(
            output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        html_saver.save_assembly_lengths(
            output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        import plotter

        # Cumulative plot over the aligned contigs.
        cumulative_plot_fpath = os.path.join(aligned_stats_dirpath, 'cumulative_plot')
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            cumulative_plot_fpath,
            'Cumulative length (aligned contigs)')

        # NAx plot uses each assembly's own length, NGAx the reference length
        # repeated once per assembly.
        plotter.Nx_plot(
            aligned_contigs_fpaths, aligned_lengths_lists,
            aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths)
        reference_lengths = [reference_length] * len(aligned_contigs_fpaths)
        plotter.Nx_plot(
            aligned_contigs_fpaths, aligned_lengths_lists,
            aligned_stats_dirpath + '/NGAx_plot', 'NGAx', reference_lengths)

    logger.info('Done.')
    return report_dict
Ejemplo n.º 31
0
Archivo: quast.py Proyecto: ctb/quast
def main(args):
    """Run a QUAST evaluation described by command-line arguments.

    The function parses ``args`` with ``getopt.gnu_getopt``, stores the
    selected settings on the ``qconfig`` module, sets up the output directory
    and the log file, passes the reference and the contigs files through
    ``_correct_reference`` / ``_correct_contigs``, and then calls the analysis
    modules (reads analysis, GAGE, basic stats, contigs analysis, aligned
    stats, genome analysis, gene prediction) and the report writers one after
    another, each only when its inputs or options are present.

    Args:
        args: list of command-line arguments (options and contigs file
            paths) without the program name.

    Returns:
        0 when the run finishes, 4 when ``_correct_contigs`` leaves no
        contigs file to analyse.  Usage problems end the process via
        ``sys.exit`` (code 0 for help/version/no arguments, 2 for bad usage)
        or are reported through ``logger.error`` with an exit code.
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    # Re-execute the qconfig module; presumably this drops settings left over
    # from an earlier main() call in the same process.
    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass over a copy of the option list: expand the test options and
    # serve help/version requests before anything else is done.
    for opt, arg in options[:]:

        if opt == '--test' or opt == '--test-sv':
            # Replace the test option with a fixed set of options and inputs
            # located under QUAST_HOME/test_data.
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reference.fasta.gz')),  # for compiling MUMmer
                        ('-O', os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                        ('-G', os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                        ('--gage', ''),  # for compiling GAGE Java classes
                        ('--gene-finding', ''), ('--eukaryote', ''), ('--glimmer', '')]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads1.fastq.gz')),
                            ('-2', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads2.fastq.gz'))]
            contigs_fpaths += [os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_1.fasta'),
                               os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_2.fasta')]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Second pass: translate every option into a qconfig setting or a local
    # variable.
    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) + ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            # Kept as the raw string here; converted to a list of ints below.
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            # Values below 1 are clamped to a single thread.
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            # Kept as the raw string here; converted to a list of ints below.
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Any value other than the three listed ones is ignored silently.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error("--extensive-mis-size should be greater than maximum indel length (%d)!"
                             % qconfig.MAX_INDEL_LENGTH, 1, to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-snps, --no-plots, --no-html
            #qconfig.no_check = True  # too risky to include
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error('Format "%s" is not supported. Please, use one of the supported formats: %s.' %
                             (arg, ', '.join(qconfig.supported_plot_extensions)), to_stderr=True, exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            # Accepted so that it is not reported as unknown; not used here.
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    # Rebuild the command line from the processed options, so that the
    # expansion of --test/--test-sv is what gets logged.
    args = [os.path.realpath(__file__)]
    for k, v in options: args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds are comma-separated strings at this point; the literal
    # string "None" stands for an empty list.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        # With --combined-ref the corrected files directory one level above
        # the output directory is used and is neither removed nor created.
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        # Start from an empty directory for the corrected input files.
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    # Reads analysis runs only when at least one reads file was given; its
    # result replaces any --bed-file value.
    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths, reads_fpaths, None,
                                      os.path.join(output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths
    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    # plotter is also needed for the HTML report (its colors are saved below).
    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # Best effort: continue without the combined PDF, but do not
            # swallow KeyboardInterrupt or SystemExit.
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        # Disable JSON output when json_saver reports a simplejson problem.
        if json_saver.simplejson_error:
            json_output_dirpath = None


    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote, os.path.join(output_dirpath, 'contigs_reports'), old_contigs_fpaths, bed_fpath)
        # Keep only the assemblies whose Nucmer status is OK.
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        # --glimmer takes precedence over the GeneMark tools.
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'), qconfig.prokaryote,
                        qconfig.meta)

    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            # The contig alignment plot needs the detailed reports and is
            # skipped when SNP output is disabled.
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            # One step per drawing task that is actually going to run.
            number_of_steps = sum([int(bool(value)) for value in [contig_report_fpath_pattern, all_pdf_file]])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info('  1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, contig_report_fpath_pattern,
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            # The PDF may not exist (e.g. PdfPages could not be created);
            # removing a missing file would raise OSError and abort the run
            # that the user only wanted to shorten.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
Ejemplo n.º 32
0
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute reference coverage, gaps and found genes/operons for one assembly.

    The alignments are read from the assembly's Nucmer coords file
    (``<assembly_name>.coords`` or ``<assembly_name>.coords.filtered``,
    depending on ``qconfig.use_all_alignments``).  Two kinds of files are
    written to ``genome_stats_dirpath``:

    * ``<assembly_name>_gaps.txt`` -- per chromosome, the uncovered stretches
      that are at least ``qconfig.min_gap_size`` long;
    * ``<assembly_name>_genes.txt`` / ``<assembly_name>_operons.txt`` -- the
      regions that are completely covered by a single aligned block (only
      written when the corresponding container has a non-empty region list).

    Args:
        contigs_fpath: path to the FASTA file with the assembly.
        index: index of the assembly, used only for log prefixes.
        nucmer_path_dirpath: directory holding the Nucmer coords files.
        genome_stats_dirpath: directory where the output files are written.
        reference_chromosomes: mapping of chromosome name to its length.
        genes_container: object exposing ``region_list`` and ``chr_names_dict``.
        operons_container: same interface as ``genes_container``.

    Returns:
        A tuple ``(results, genes_in_contigs, operons_in_contigs)``:
        ``results`` maps "covered_bp", "gaps_count" and
        ``<field>_full`` / ``<field>_partial`` to the computed values (the
        last two are None when the container has no regions);
        ``genes_in_contigs`` / ``operons_in_contigs`` hold, for the i-th
        longest contig, the number of fully found features in it.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
            indent='  ')

    # One flag per reference position; index 0 is unused because coordinates are 1-based.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    # EXAMPLE of the coords file:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    with open(nucmer_fpath, 'r') as coordfile:
        # skipping the header: everything up to (and including) the '=====' line
        for line in coordfile:
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            columns = line.split('|')
            ref_coords = columns[0].split()
            contig_coords = columns[1].split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12]
            chr_name = tokens[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")

            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
                for pos in range(s1, len(chr_mapping)):
                    chr_mapping[pos] = 1
                for pos in range(1, e1 + 1):
                    chr_mapping[pos] = 1
            else:
                for pos in range(s1, e1 + 1):
                    chr_mapping[pos] = 1

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write(str(chr_name) + '\n')
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            for pos in range(1, chr_len + 1):
                if chr_mapping[pos] == 1:
                    # a covered position closes the current gap (if it is long enough to report)
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write('%d %d\n' % (pos - cur_gap_size, pos - 1))
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1

            # a gap that reaches the end of the chromosome
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write('%d %d\n' % (chr_len - cur_gap_size + 1, chr_len))

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container,
         genes_in_contigs,
         reporting.Fields.GENES,
         '_genes.txt'),

        (operons_container,
         operons_in_contigs,
         reporting.Fields.OPERONS,
         '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\n' % ('ID or #', 'Start', 'End'))
            found_file.write('============================\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes: a block with start > end is split in two,
                        # the first part is given an end just past the region end
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                # a separate name is used so the enumerate index `i` is never overwritten
                                region_label = str(region.id)
                                if region_label == 'None':
                                    region_label = '# ' + str(region.number + 1)
                                found_file.write('%s\t\t%d\t%d\n' % (region_label, region.start, region.end))
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    # a fully found region is not searched for in the remaining contigs
                    if cur_feature_is_found:
                        break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
Ejemplo n.º 33
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir,
       results_dir):
    """Compute basic statistics (N50, L50, GC %, N's, ...) for every assembly.

    The per-assembly values are stored in the report returned by
    ``reporting.get(contigs_fpath)``.  Reference length, contig lengths and GC
    distributions are additionally passed to ``json_saver`` (when
    ``json_output_dir`` is set) and to ``html_saver`` (when
    ``qconfig.html_report`` is set); plots are drawn when
    ``qconfig.draw_plots`` is set.

    Args:
        ref_fpath: path to the reference FASTA file, or a false value.
        contigs_fpaths: paths to the assemblies' FASTA files.
        output_dirpath: directory for the plots (created if missing).
        json_output_dir: directory for JSON output, or a false value.
        results_dir: directory passed to the HTML saver.
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(
            fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths,
                                        lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        lists_of_lengths)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)

        # The ratio is undefined for an assembly of zero total length;
        # computing it unconditionally would raise ZeroDivisionError.
        if total_length:
            ns_per_100kbp = '%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100kbp = None

        logger.info('    ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' ' + ns_per_100kbp if ns_per_100kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # max() of an empty list raises ValueError, so the size-dependent
        # fields are only filled in for assemblies that contain contigs.
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            # `is not None` (as in the log line above): a GC of 0.0 is a
            # valid value and must not be reported as missing.
            report.add_field(reporting.Fields.GC,
                             ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            if ns_per_100kbp is not None:
                report.add_field(reporting.Fields.UNCALLED_PERCENT,
                                 ns_per_100kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths,
                                list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot',
                                'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # A new list is built so that the per-assembly list is not mutated.
        if ref_fpath:
            list_of_GC_distributions_with_ref = \
                list_of_GC_distributions + [reference_GC_distribution]
        else:
            list_of_GC_distributions_with_ref = list_of_GC_distributions
        plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(
                contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot',
                'NGx', [reference_length] * len(contigs_fpaths))

    logger.info('Done.')
Ejemplo n.º 34
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic statistics (N50, L50, GC %, N's, ...) for every assembly.

    The per-assembly values are stored in the report returned by
    ``reporting.get(contigs_fpath)``.  Reference length, (possibly
    down-sampled) contig lengths and GC distributions are passed to
    ``json_saver`` (when ``json_output_dir`` is set) and to ``html_saver``
    (when ``qconfig.html_report`` is set).  The Nx/NGx plotting routine is
    always called; the cumulative and GC plots only when
    ``qconfig.draw_plots`` is set.  ``qconfig.min_difference`` is updated as a
    side effect.

    Args:
        ref_fpath: path to the reference FASTA file, or a false value.
        contigs_fpaths: paths to the assemblies' FASTA files.
        output_dirpath: directory for the plots (created if missing).
        json_output_dir: directory for JSON output, or a false value.
        results_dir: directory passed to the HTML saver and to the Nx plots.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info("  Reference genome:")
        logger.info(
            "    "
            + os.path.basename(ref_fpath)
            + ", Reference length = "
            + str(reference_length)
            + ", Reference GC % = "
            + "%.2f" % reference_GC
        )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info("  Estimated reference length = " + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver

            html_saver.save_reference_length(results_dir, reference_length)

    logger.info("  Contig files: ")
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info("    " + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count("N")

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(lengths) for lengths in lists_of_lengths])

    # When there are too many contigs, the lengths that are saved for the
    # reports are merged: every `multiplicator` consecutive (sorted) lengths
    # are summed into one value.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = [
            [
                sum(lengths[((i - 1) * multiplicator) : (i * multiplicator)])
                for i in range(1, max_points)
                if (i * multiplicator) < len(lengths)
            ]
            for lengths in lists_of_lengths
        ]
        # the remaining tail of every list goes into one last value
        for lengths, corr_lengths in zip(lists_of_lengths, corr_lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lengths[last_index * multiplicator :]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver

        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info("  Calculating N50 and L50...")

    list_of_GC_distributions = []
    largest_contig = 0
    import N50

    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
        zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)
    ):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # The ratio is undefined for a zero total length.  It is formatted
        # separately so that the "undefined" fallback replaces only this
        # value and not the whole log message.
        if total_length != 0:
            ns_per_100kbp = "%.2f" % (float(number_of_Ns) * 100000.0 / float(total_length))
            ns_per_100kbp_msg = " " + ns_per_100kbp
        else:
            ns_per_100kbp = None
            ns_per_100kbp_msg = "undefined"

        logger.info(
            "    "
            + qutils.index_to_str(index)
            + qutils.label_from_fpath(contigs_fpath)
            + ", N50 = "
            + str(n50)
            + ", L50 = "
            + str(l50)
            + ", Total length = "
            + str(total_length)
            + ", GC % = "
            + ("%.2f" % total_GC if total_GC is not None else "undefined")
            + ", # N's per 100 kbp = "
            + ns_per_100kbp_msg
        )

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ("%.2f" % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            # skipped when all contigs are empty (total length of zero)
            if ns_per_100kbp is not None:
                report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, "%.2f" % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math

    # NOTE(review): with integer operands under Python 2 both divisions
    # truncate, so ceil() has nothing left to round up -- confirm whether a
    # true (float) division was intended here.
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver

        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(
        results_dir,
        num_contigs > qconfig.max_points,
        contigs_fpaths,
        lists_of_lengths,
        output_dirpath + "/Nx_plot",
        "Nx",
        [],
        json_output_dir=json_output_dir,
    )
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(
            results_dir,
            num_contigs > qconfig.max_points,
            contigs_fpaths,
            lists_of_lengths,
            output_dirpath + "/NGx_plot",
            "NGx",
            [reference_length] * len(contigs_fpaths),
            json_output_dir=json_output_dir,
        )

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(
            ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + "/cumulative_plot", "Cumulative length"
        )
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            # A new list is built so that the per-assembly list is not mutated.
            if ref_fpath:
                list_of_GC_distributions_with_ref = list_of_GC_distributions + [reference_GC_distribution]
            else:
                list_of_GC_distributions_with_ref = list_of_GC_distributions
            plotter.GC_content_plot(
                ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + "/GC_content_plot"
            )

    logger.main_info("Done.")
Ejemplo n.º 35
0
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Run GeneMark gene prediction on every assembly and store the counts.

    The tool flavour is chosen from the flags: MetaGeneMark when ``meta`` is
    set, otherwise GeneMarkS when ``prokaryote`` is set, otherwise
    GeneMark-ES.  Predictions run in parallel through joblib; the numbers of
    unique and total predicted genes are added to the report returned by
    ``reporting.get(fasta_path)``.

    Nothing is run when the license-limitations mode is on, when the tool is
    not available for ``qconfig.platform_name`` or when ``install_genemark``
    reports a failure.

    Args:
        fasta_fpaths: paths to the assemblies' FASTA files.
        gene_lengths: passed through to ``predict_genes``.
        out_dirpath: output directory (created if missing); a ``tmp``
            sub-directory is created in it and removed afterwards unless
            ``qconfig.debug`` is set.
        prokaryote: selects GeneMarkS when ``meta`` is not set.
        meta: selects MetaGeneMark.
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(
            '  Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
        return

    # NOTE(review): the install step always targets the 'genemark' directory,
    # even when GeneMark-ES ('genemark-es') was selected -- presumably
    # intentional, confirm.
    successful = install_genemark(
        os.path.join(qconfig.LIBS_LOCATION, 'genemark',
                     qconfig.platform_name))
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # At least one job: with an empty input list min() would give 0, which
    # makes the division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, fasta_fpath, gene_lengths,
                               out_dirpath, tool_dirpath, tmp_dirpath,
                               gmhmm_p_function, prokaryote, num_threads)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    # saving results
    for i, (fasta_path, (unique_count, count)) in enumerate(
            zip(fasta_fpaths, results)):
        report = reporting.get(fasta_path)
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                             unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # the hint is only added for small inputs given to GeneMark-ES
            if (tool_name == 'GeneMark-ES'
                    and os.path.getsize(fasta_path) < 2000000):
                hint = ('File may be too small for GeneMark-ES. Try to use '
                        'GeneMarkS instead (remove --eukaryote option).')
            else:
                hint = ''
            logger.error(
                '  ' + qutils.index_to_str(i) +
                'Failed predicting genes in ' +
                qutils.label_from_fpath(fasta_path) + '. ' + hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Ejemplo n.º 36
0
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length, output_dir_path, cov_fpath, ref_fpath, genome_size):
    chr_to_aligned_blocks = dict()
    for chr in chr_names:
        chr_init = []
        for fpath in contigs_fpaths:
            f = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            f.label = qutils.label_from_fpath(fpath)
            f.unshifted_start = 0
            f.unshifted_end = 0
            chr_init.append(f)
        chr_to_aligned_blocks.setdefault(chr, chr_init)
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path, alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)
    import contigs_analyzer
    if contigs_analyzer.ref_labels_by_chromosomes:
        contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes
        chr_full_names = list(set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    if cov_fpath:
        cov_data = dict()
        not_covered = dict()
        cur_len = dict()
        with open(cov_fpath, 'r') as coverage:
            name = chr_names[0]
            contig_to_chr = {}
            for chr in chr_full_names:
                cov_data.setdefault(chr, [])
                not_covered.setdefault(chr, [])
                cur_len.setdefault(chr, 0)
                if contigs_analyzer.ref_labels_by_chromosomes:
                    contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                elif chr == NAME_FOR_ONE_PLOT:
                    contigs = chr_names
                else:
                    contigs = [chr]
                for contig in contigs:
                    contig_to_chr[contig] = chr
            for index, line in enumerate(coverage):
                c = list(line.split())
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name]/100)
                    cur_len[name] = 0
                if c[2] == '0':
                    not_covered[name].append(c[1])
    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    for i, chr in enumerate(chr_full_names):
        short_chr = chr[:30]
        num_misassemblies[chr] = 0
        aligned_bases_by_chr[chr] = []
        aligned_assemblies[chr] = []
        with open(os.path.join(output_all_files_dir_path, 'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contigs_analyzer.ref_labels_by_chromosomes:
                contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []
            elif chr == NAME_FOR_ONE_PLOT:
                contigs = chr_names
            else:
                contigs = [chr]
            chr_size = sum([chromosomes_length[contig] for contig in contigs])
            chr_sizes[chr] = chr_size
            num_contigs[chr] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[chr].extend(aligned_bases[contig])
            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                l = chromosomes_length[contig]
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(**locals())
            result.write(data_str)

            # adding assembly data
            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(**locals())
            prev_len = 0
            chr_lengths = [0] + [chromosomes_length[contig] for contig in contigs]
            for num_contig, contig in enumerate(contigs):
                if num_contig > 0:
                    prev_len += chr_lengths[num_contig]
                if len(chr_to_aligned_blocks[contig]) > 0:
                    for alignment in chr_to_aligned_blocks[contig]:
                        if alignment.misassembled:
                            num_misassemblies[chr] += 1
                        corr_start = prev_len + alignment.unshifted_start
                        corr_end = prev_len + alignment.unshifted_end
                        data_str += '{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},' \
                                    'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, assembly: "{alignment.label}", similar: "{alignment.similar}", misassembled: "{alignment.misassembled}" '.format(**locals())
                        if alignment.name != 'FICTIVE':
                            if len(aligned_assemblies[chr]) < len(contigs_fpaths) and alignment.label not in aligned_assemblies[chr]:
                                aligned_assemblies[chr].append(alignment.label)
                            data_str += ', structure: ['
                            for el in alignment.misassembled_structure:
                                if type(el) == list:
                                    if el[5] in contigs:
                                        num_chr = contigs.index(el[5])
                                        corr_len = sum(chr_lengths[:num_chr+1])
                                    else:
                                        corr_len = -int(el[1])
                                        if contigs_analyzer.ref_labels_by_chromosomes and el[5] not in used_chromosomes:
                                            used_chromosomes.append(el[5])
                                            new_chr = contig_names_by_refs[el[5]]
                                            links_to_chromosomes.append('links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'.format(**locals()))
                                    corr_start = corr_len + int(el[0])
                                    corr_end = corr_len + int(el[1])
                                    data_str += '{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}},'.format(**locals())
                                elif type(el) == str:
                                    data_str += '{{type: "M", mstype: "{el}"}},'.format(**locals())
                            if data_str[-1] == '[':
                                data_str = data_str + ']},'
                            else:
                                data_str = data_str[: -1] + ']},'
                        else: data_str += '},'
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)
            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write(''.join(links_to_chromosomes))
            if cov_fpath:
                # adding coverage data
                data_str = 'var coverage_data = {};\n'
                if cov_data[chr]:
                    data_str += 'coverage_data["{chr}"] = [ '.format(**locals())
                    for e in cov_data[chr]:
                        data_str += '{e},'.format(**locals())
                        if len(data_str) > 10000 and e != cov_data[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1] + '];\n'
                    result.write(data_str)
                    data_str = ''

                data_str = 'var not_covered = {};\n'
                data_str += 'not_covered["{chr}"] = [ '.format(**locals())
                if len(not_covered[chr]) > 0:
                    for e in not_covered[chr]:
                        data_str += '{e},'.format(**locals())
                        if len(data_str) > 10000 and e != cov_data[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1]
                data_str += '];\n'
                result.write(data_str)
                data_str = ''

            with open(html_saver.get_real_path('_chr_templ.html'), 'r') as template:
                with open(os.path.join(output_all_files_dir_path, '_{short_chr}.html'.format(**locals())), 'w') as result:
                    for line in template:
                        if line.find('<script type="text/javascript" src=""></script>') != -1:
                            result.write('<script type="text/javascript" src="data_{short_chr}.js"></script>\n'.format(**locals()))
                        else:
                            result.write(line)
                            if line.find('<body>') != -1:
                                chr_size = chr_sizes[chr]
                                chr_name = chr.replace('_', ' ')
                                if len(chr_name) > 50:
                                    chr_name = chr_name[:50] + '...'
                                title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + ('%s fragments, ' % num_contigs[chr] if num_contigs[chr] > 1 else '') + '%s bp)' % format_long_numbers(chr_size)
                                result.write('<div class = "block title"><a href="../{summary_fname}"><button class="back_button">&crarr;</button></a>{title}</div>\n'.format(**locals()))
                            if line.find('<script type="text/javascript">') != -1:
                                chromosome = '","'.join(contigs)
                                result.write('var CHROMOSOME = "{chr}";\n'.format(**locals()))
                                result.write('var chrContigs = ["{chromosome}"];\n'.format(**locals()))

    with open(html_saver.get_real_path('alignment_summary_templ.html'), 'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if line.find('<!--- assemblies: ---->') != -1:
                    if not is_unaligned_asm_exists:
                        result.write('<div class="subtitle"># assemblies: %s</div>' % len(contigs_fpaths))
                if line.find('<!--- th_assemblies: ---->') != -1:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if line.find('<!--- references: ---->') != -1:
                    for chr in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = chr[:30]
                        chr_link = os.path.join(alignment_plots_dirname, '_{short_chr}.html'.format(**locals()))
                        chr_name = chr.replace('_', ' ')
                        aligned_lengths = [aligned_len for aligned_len in aligned_bases_by_chr[chr] if aligned_len is not None]
                        chr_genome = sum(aligned_lengths) * 100.0 / (chr_sizes[chr] * len(contigs_fpaths))
                        chr_size = chr_sizes[chr]
                        result.write('<td><a href="%s">%s</a></td>' % (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[chr])
                        result.write('<td>%s</td>' % format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' % len(aligned_assemblies[chr]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[chr])
                        result.write('</tr>')

    copyfile(html_saver.get_real_path(os.path.join('static', 'contig_alignment_plot.css')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')),
             os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'scripts', 'contig_alignment_plot_script.js')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot_script.js'))
Ejemplo n.º 37
0
def process_single_file(contigs_fpath, index, nucmer_path_dirpath,
                        genome_stats_dirpath, reference_chromosomes,
                        genes_container, operons_container):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    Reads the Nucmer ``.coords`` file of the assembly (the ``.filtered``
    variant unless ``qconfig.use_all_alignments`` is set), marks every
    reference position covered by an alignment, and writes two kinds of
    report files into ``genome_stats_dirpath``:

    * ``<assembly>_gaps.txt`` -- per chromosome, the uncovered intervals
      whose length is at least ``qconfig.min_gap_size``;
    * ``<assembly>_genes.txt`` / ``<assembly>_operons.txt`` -- the regions
      of the corresponding container that are fully covered by a single
      aligned block (written only when the container has regions).

    Args:
        contigs_fpath: path to the assembly FASTA file.
        index: assembly index, used only for log message prefixes.
        nucmer_path_dirpath: directory holding ``<assembly>.coords[.filtered]``.
        genome_stats_dirpath: directory the report files are written to.
        reference_chromosomes: mapping of chromosome name -> chromosome length.
        genes_container: object exposing ``region_list`` and ``chr_names_dict``.
        operons_container: same interface as ``genes_container``.

    Returns:
        A tuple ``(results, genes_in_contigs, operons_in_contigs)``.
        ``results`` holds ``"covered_bp"``, ``"gaps_count"`` and, per feature
        field, ``field + "_full"`` / ``field + "_partial"`` (both ``None`` when
        the container has no regions).  The two lists hold, for the i-th
        longest contig, the number of fully found genes / operons in it.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath,
                                     assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath +
                     ') not found! Try to restart QUAST.',
                     indent='  ')

    # Per-chromosome coverage flags; positions are 1-based, index 0 is unused.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    # FASTA entries as (name, seq) tuples, longest contig first.
    contig_tuples = sorted(fastaparser.read_fasta(contigs_fpath),
                           key=lambda contig: len(contig[1]),
                           reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    # For cumulative plots: i-th element is the number of genes/operons
    # found in the i-th contig.
    genes_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)

    # For gene finding: contig_name --> list of AlignedBlock.
    aligned_blocks_by_contig_name = {}
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    # EXAMPLE of a coords file:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    with open(nucmer_fpath, 'r') as coordfile:
        # Skip the header: everything up to and including the '=====' line.
        for line in coordfile:
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            # Split each line once instead of re-splitting for every field.
            columns = line.split('|')
            ref_coords = columns[0].split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            contig_coords = columns[1].split()
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12]
            chr_name = tokens[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")

            aligned_blocks_by_contig_name[contig_name].append(
                AlignedBlock(seqname=chr_name, start=s1, end=e1))

            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # Special case: circular genome, contig starts on the end of
                # a chromosome and ends in the beginning.
                for pos in range(s1, len(chr_mapping)):
                    chr_mapping[pos] = 1
                for pos in range(1, e1 + 1):
                    chr_mapping[pos] = 1
            else:
                for pos in range(s1, e1 + 1):
                    chr_mapping[pos] = 1

    # Counting genome coverage and gaps number.
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath,
                              assembly_name + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write(str(chr_name) + '\n')
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            for pos in range(1, chr_len + 1):
                if chr_mapping[pos] == 1:
                    # A covered position closes the current gap (if any).
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write('%s %s\n' % (pos - cur_gap_size, pos - 1))
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1

            # Gap that runs up to the end of the chromosome.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write('%s %s\n' % (chr_len - cur_gap_size + 1, chr_len))

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # Finding genes and operons.
    feature_specs = [
        (genes_container, genes_in_contigs, reporting.Fields.GENES,
         '_genes.txt'),
        (operons_container, operons_in_contigs, reporting.Fields.OPERONS,
         '_operons.txt'),
    ]
    for container, feature_in_contigs, field, suffix in feature_specs:
        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath,
                                   assembly_name + suffix)
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\n' % ('ID or #', 'Start', 'End'))
            found_file.write('============================\n')

            for region in container.region_list:
                # Per-region state: a region counts as "partial" at most once
                # and is promoted to "full" as soon as one block contains it.
                partially_found = False
                fully_found = False
                for contig_id, name in enumerate(sorted_contigs_names):
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        if cur_block.start > cur_block.end:
                            # Block wraps around the end of a circular genome:
                            # split it in two.  The first piece ends just past
                            # the region, so it contains any region that
                            # starts at or after the block start.
                            blocks = [
                                AlignedBlock(seqname=cur_block.seqname,
                                             start=cur_block.start,
                                             end=region.end + 1),
                                AlignedBlock(seqname=cur_block.seqname,
                                             start=1,
                                             end=cur_block.end),
                            ]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue  # no overlap at all
                            if block.start <= region.start and region.end <= block.end:
                                if partially_found:
                                    # Already counted as partial: undo that.
                                    total_partial -= 1
                                total_full += 1
                                region_id = str(region.id)
                                if region_id == 'None':
                                    region_id = '# ' + str(region.number + 1)
                                found_file.write('%s\t\t%d\t%d\n' % (
                                    region_id, region.start, region.end))
                                # Inc number of found genes/operons in this contig.
                                feature_in_contigs[contig_id] += 1
                                fully_found = True
                                break
                            overlap = min(region.end, block.end) - max(region.start, block.start)
                            if not partially_found and overlap >= qconfig.min_gene_overlap:
                                partially_found = True
                                total_partial += 1
                        if fully_found:
                            break
                    if fully_found:
                        break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
Ejemplo n.º 38
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Run the basic statistics stage for all assemblies.

    For every assembly in ``contigs_fpaths`` this reads the contig lengths and
    the number of ``N`` characters, computes N50/L50, N75/L75 (and the NGx/LGx
    counterparts when a reference length is known), total length and GC
    content, and stores them in the assembly's report.  Contig lengths and GC
    distributions are additionally saved for the JSON/HTML reports, and the
    Nx/NGx, cumulative and GC plots are drawn.

    Args:
        ref_fpath: path to the reference FASTA, or a false value if absent
            (then ``qconfig.estimated_reference_size`` is used when set).
        contigs_fpaths: paths of the assembly FASTA files.
        output_dirpath: directory for the plots; created if missing.
        json_output_dir: directory for JSON output, or a false value to skip.
        results_dir: directory passed to ``html_saver`` and ``plotter.Nx_plot``.

    Side effects:
        Sets ``qconfig.min_difference``.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(index) + assembly_label)
        # Single pass over the FASTA: collect lengths and count N's together.
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max(len(list_of_length) for list_of_length in lists_of_lengths)

    # When there are too many contigs to plot individually, merge every
    # `multiplicator` consecutive contigs (longest first) into one point.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs // qconfig.max_points)
        max_points = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = []
        for lengths in lists_of_lengths:
            merged = [sum(lengths[(i - 1) * multiplicator:i * multiplicator])
                      for i in range(1, max_points)
                      if (i * multiplicator) < len(lengths)]
            # Everything not covered by the full buckets goes into the last point.
            merged.append(sum(lengths[len(merged) * multiplicator:]))
            corr_lists_of_lengths.append(merged)
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    largest_contig = 0
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # Build the optional parts first: a conditional expression binds
        # looser than '+', so inlining it unparenthesised at the end of the
        # concatenation would replace the whole message by 'undefined'.
        if total_GC is not None:
            gc_str = '%.2f' % total_GC
        else:
            gc_str = 'undefined'
        if total_length != 0:
            ns_per_100kbp_str = ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100kbp_str = 'undefined'
        logger.info('    ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + gc_str +
                    ', # N\'s per 100 kbp = ' + ns_per_100kbp_str)

        report_ngx = reference_length and not qconfig.is_combined_ref
        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if report_ngx:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if report_ngx:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            longest = max(lengths_list)
            report.add_field(reporting.Fields.LARGCONTIG, longest)
            largest_contig = max(largest_contig, longest)
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            # NOTE(review): raises ZeroDivisionError if every contig is empty
            # (non-empty lengths_list with total_length == 0).
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    # NOTE(review): with integer operands under Python 2 both divisions floor
    # before ceil() is applied -- confirm this is the intended rounding.
    qconfig.min_difference = math.ceil((largest_contig/1000)/600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter
    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length] * len(contigs_fpaths), json_output_dir=json_output_dir)

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            # Work on a copy so that appending the reference distribution
            # does not modify list_of_GC_distributions itself.
            list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + '/GC_content_plot')

    logger.main_info('Done.')