Beispiel #1
0
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning('  Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    else:
        successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)
            if unique_count is None and count is None:
                logger.error('  ' + qutils.index_to_str(i) +
                     'Failed predicting genes in ' + qutils.label_from_fpath(fasta_path) + '. ' +
                     ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                         if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)

        logger.main_info('Done.')
Beispiel #2
0
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        return_code = qutils.call_subprocess(
            ['make', '-C', tool_src_dirpath],
            stdout=open(os.path.join(tool_src_dirpath, 'make.log'), 'w'),
            stderr=open(os.path.join(tool_src_dirpath, 'make.err'), 'w'),
            indent='    ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error(
                "Failed to compile GlimmerHMM (" + tool_src_dirpath +
                ")!\nTry to compile it manually or do not use --gene-finding "
                "option with --eukaryote.\nUse --debug option to see the command lines."
            )
            return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath,
                               tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        unique, cnt = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            logger.error(
                'Glimmer failed running Glimmer for %s. ' +
                ('Run with the --debug option'
                 ' to see the command line.' if not qconfig.debug else '') %
                qutils.label_from_fpath(contigs_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Beispiel #3
0
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        return_code = qutils.call_subprocess(
            ['make', '-C', tool_src_dirpath],
            stdout=open(os.path.join(tool_src_dirpath, 'make.log'), 'w'),
            stderr=open(os.path.join(tool_src_dirpath, 'make.err'), 'w'),
            indent='    ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error("Failed to compile GlimmerHMM (" + tool_src_dirpath +
                         ")!\nTry to compile it manually or do not use --gene-finding "
                         "option with --eukaryote.\nUse --debug option to see the command lines.")
            return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
        index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        unique, cnt = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            logger.error(
                'Glimmer failed running Glimmer for %s. ' + ('Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else '') % qutils.label_from_fpath(contigs_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Beispiel #4
0
def do(fasta_fpaths, gene_lengths, out_dirpath, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'metagenemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    else:
        tool_name = 'GeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC

    logger.info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning('  Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    else:
        successful = install_genemark(tool_dirpath)
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)

        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)

        logger.info('Done.')
Beispiel #5
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
       genome_stats_dirpath):

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath,
                                       'nucmer_output')
    from libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() +
                          ' option '
                          'if you want to specify it.',
                          indent='  ')
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(
                fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.',
                           indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' +
                        container.kind + 's')
            res_file.write(container.kind + 's loaded: ' +
                           str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(
                container.kind, container.region_list,
                reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES,
                             len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS,
                             len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {
    }  #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(
        delayed(process_single_file)(
            contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
            reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [
        process_results[i][1] for i in range(len(process_results))
    ]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [
            ref_lengths[i][ref] for i in range(len(ref_lengths))
        ]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                       ' bp, maximal covered length: ' + str(aligned_len) +
                       ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' +
                   str(qconfig.min_gene_overlap) + '\n\n')
    # header
    # header
    res_file.write('\n\n')
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
         'operons', 'partial'))
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write(
        '================================================================================================================\n'
    )

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
            aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|' %
                       (assembly_name[:24], '%3.5f%%' % genome_fraction,
                        '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME,
                         '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO,
                         '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full,
             part) in [(reporting.Fields.GENES, genes_full, genes_part),
                       (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths,
                                                'genes',
                                                files_genes_in_contigs,
                                                ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'genes',
                                                files_genes_in_contigs,
                                                ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(
                len(genes_container.region_list), aligned_contigs_fpaths,
                files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_genes,
                genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(
                len(operons_container.region_list), aligned_contigs_fpaths,
                files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_operons,
                genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths,
                          genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
Beispiel #6
0
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(
            '  Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
    else:
        successful = install_genemark(
            os.path.join(qconfig.LIBS_LOCATION, 'genemark',
                         qconfig.platform_name))
        if not successful:
            return

        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, fasta_fpath, gene_lengths,
                                   out_dirpath, tool_dirpath, tmp_dirpath,
                                   gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))

        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            unique_count, count = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                                 unique_count)
            if count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES, count)
            if unique_count is None and count is None:
                logger.error(
                    '  ' + qutils.index_to_str(i) +
                    'Failed predicting genes in ' +
                    qutils.label_from_fpath(fasta_path) + '. ' +
                    ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                     if tool_name == 'GeneMark-ES'
                     and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)

        logger.main_info('Done.')
Beispiel #7
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.info('Running GAGE...')

    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs', 'Indels < 5bp',
               'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG, reporting.Fields.GAGE_MAXCONTIG, 
                            reporting.Fields.GAGE_N50, reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES, reporting.Fields.GAGE_MISSINGASMBLYBASES, 
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS, reporting.Fields.GAGE_DUPREFBASES, 
                            reporting.Fields.GAGE_COMPRESSEDREFBASES, reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY, 
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS, 
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION, 
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG, 
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if (metrics[cur_metric_id].startswith('N50')):
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
Beispiel #8
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')

    logger.print_timestamp()
    logger.info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        res_file.write('\t' + chr_name + ' (' + str(chr_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent='  ')
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    # for cumulative plots:
    files_genes_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    results_genes_operons_tuples = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
        % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
            (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.info('Done.')
Beispiel #9
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):

    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(
            sum(fastaparser.get_lengths_from_fastafile(contigs_fpath)))

    import N50
    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists,
                           assembly_lengths)):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        logger.info(
            '  ' + qutils.index_to_str(i) +
            qutils.label_from_fpath(contigs_fpath) + ', Largest alignment = ' +
            str(max(lens)) + ', NA50 = ' + str(na50) +
            (', NGA50 = ' +
             str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
            ', LA50 = ' + str(la50) +
            (', LGA50 = ' +
             str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    num_contigs = max([
        len(aligned_lengths_lists[i])
        for i in range(len(aligned_lengths_lists))
    ])

    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
            'Cumulative length (aligned contigs)')

        # Drawing NAx and NGAx plots...
    plotter.Nx_plot(output_dirpath,
                    num_contigs > qconfig.max_points,
                    aligned_contigs_fpaths,
                    aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot',
                    'NAx',
                    assembly_lengths,
                    json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(
            output_dirpath,
            num_contigs > qconfig.max_points,
            aligned_contigs_fpaths,
            aligned_lengths_lists,
            aligned_stats_dirpath + '/NGAx_plot',
            'NGAx',
            [reference_length for i in range(len(aligned_contigs_fpaths))],
            json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
Beispiel #10
0
def main(args):
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),   # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding',''), ('--eukaryote','')] # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except:
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) + ' (use --threads option to set it manually)')


    ########################################################################
    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats'))


    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            number_of_steps = sum([int(bool(value)) for value in [detailed_contigs_reports_dirpath, all_pdf_file]])
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info('  1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info('  PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info('  Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
Beispiel #11
0
def main(args):
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    for opt, arg in options[:]:

        if opt == '--test' or opt == '--test-sv':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reference.fasta.gz')),  # for compiling MUMmer
                        ('-O', os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                        ('-G', os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                        ('--gage', ''),  # for compiling GAGE Java classes
                        ('--gene-finding', ''), ('--eukaryote', ''), ('--glimmer', '')]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads1.fastq.gz')),
                            ('-2', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads2.fastq.gz'))]
            contigs_fpaths += [os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_1.fasta'),
                               os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_2.fasta')]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) + ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error("--extensive-mis-size should be greater than maximum indel length (%d)!"
                             % qconfig.MAX_INDEL_LENGTH, 1, to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-plots, --no-snps
            #qconfig.no_check = True  # too risky to include
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error('Format "%s" is not supported. Please, use one of the supported formats: %s.' %
                             (arg, ', '.join(qconfig.supported_plot_extensions)), to_stderr=True, exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    args = [os.path.realpath(__file__)]
    for k, v in options: args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths, reads_fpaths, None,
                                      os.path.join(output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths
    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None


    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote, os.path.join(output_dirpath, 'contigs_reports'), old_contigs_fpaths, bed_fpath)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'), qconfig.prokaryote,
                        qconfig.meta)
            
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            number_of_steps = sum([int(bool(value)) for value in [contig_report_fpath_pattern, all_pdf_file]])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info('  1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, contig_report_fpath_pattern,
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
Beispiel #12
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):

    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_lengths_from_fastafile(contigs_fpath)))

    import N50
    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)):
        na50 = N50.NG50(lens, assembly_len)
        nga50 = N50.NG50(lens, reference_length)
        na75 = N50.NG50(lens, assembly_len, 75)
        nga75 = N50.NG50(lens, reference_length, 75)
        la50 = N50.LG50(lens, assembly_len)
        lga50 = N50.LG50(lens, reference_length)
        la75 = N50.LG50(lens, assembly_len, 75)
        lga75 = N50.LG50(lens, reference_length, 75)
        logger.info('  ' +
                    qutils.index_to_str(i) +
                    qutils.label_from_fpath(contigs_fpath) +
                 ', Largest alignment = ' + str(max(lens)) +
                 ', NA50 = ' + str(na50) +
                 ', NGA50 = ' + str(nga50) +
                 ', LA50 = ' + str(la50) +
                 ', LGA50 = ' + str(lga50))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NGA50, nga50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.NGA75, nga75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LGA50, lga50)
        report.add_field(reporting.Fields.LA75, la75)
        report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    # saving to JSON
    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_aligned_contigs_lengths(json_output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_aligned_contigs_lengths(output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        import plotter
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

        # Drawing NAx and NGAx plots...
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths)
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NGAx_plot', 'NGAx', [reference_length for i in range(len(aligned_contigs_fpaths))])

    logger.info('Done.')
    return report_dict
Beispiel #13
0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs', 'Indels < 5bp',
               'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG, reporting.Fields.GAGE_MAXCONTIG, 
                            reporting.Fields.GAGE_N50, reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES, reporting.Fields.GAGE_MISSINGASMBLYBASES, 
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS, reporting.Fields.GAGE_DUPREFBASES, 
                            reporting.Fields.GAGE_COMPRESSEDREFBASES, reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY, 
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS, 
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION, 
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG, 
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if (metrics[cur_metric_id].startswith('N50')):
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
Beispiel #14
0
def main(args):
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),   # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding',''), ('--eukaryote','')] # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except:
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) + ' (use --threads option to set it manually)')


    ########################################################################
    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
              "Please, provide different files or decrease --min-contig threshold.",
              fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote, os.path.join(output_dirpath, 'contigs_reports'))
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding:
        if qconfig.prokaryote or qconfig.meta:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.meta)
        else:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
    else:
        logger.info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            number_of_steps = sum([int(bool(value)) for value in [detailed_contigs_reports_dirpath, all_pdf_file]])
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info('  1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info('  %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info('  Text versions of total report are saved to ' + reports_fpaths)
    logger.info('  Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info('  PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info('  Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
Beispiel #15
0
def main(args):
    if ' ' in qconfig.QUAST_HOME:
        logger.error(
            'QUAST does not support spaces in paths. \n'
            'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
            'Please, put QUAST in a different directory, then try again.\n',
            to_stderr=True,
            exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args,
                                                    qconfig.short_options,
                                                    qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    for opt, arg in options[:]:

        if opt == '--test' or opt == '--test-sv':
            options.remove((opt, arg))
            options += [
                ('-o', 'quast_test_output'),
                ('-R',
                 os.path.join(qconfig.QUAST_HOME, 'test_data',
                              'reference.fasta.gz')),  # for compiling MUMmer
                ('-O',
                 os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                ('-G',
                 os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                ('--gage', ''),  # for compiling GAGE Java classes
                ('--gene-finding', ''),
                ('--eukaryote', ''),
                ('--glimmer', '')
            ]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1',
                             os.path.join(qconfig.QUAST_HOME, 'test_data',
                                          'reads1.fastq.gz')),
                            ('-2',
                             os.path.join(qconfig.QUAST_HOME, 'test_data',
                                          'reads2.fastq.gz'))]
            contigs_fpaths += [
                os.path.join(qconfig.QUAST_HOME, 'test_data',
                             'contigs_1.fasta'),
                os.path.join(qconfig.QUAST_HOME, 'test_data',
                             'contigs_2.fasta')
            ]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) +
                             ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error(
                    "--extensive-mis-size should be greater than maximum indel length (%d)!"
                    % qconfig.MAX_INDEL_LENGTH,
                    1,
                    to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-plots, --no-snps
            #qconfig.no_check = True  # too risky to include
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error(
                    'Format "%s" is not supported. Please, use one of the supported formats: %s.'
                    % (arg, ', '.join(qconfig.supported_plot_extensions)),
                    to_stderr=True,
                    exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' %
                         (opt + ' ' + arg),
                         to_stderr=True,
                         exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice(
            "Output directory already exists. Existing Nucmer alignments can be used."
        )
        qutils.remove_reports(output_dirpath)

    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int,
                                        qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..',
                                         qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = _correct_contigs(
        contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME,
                         qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(ref_fpath,
                                      contigs_fpaths,
                                      reads_fpaths,
                                      None,
                                      os.path.join(output_dirpath,
                                                   qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold.",
            fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths
    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning(
                "GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths,
                   os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, bed_fpath)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[
                    contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(
                    aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(
            output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                         json_output_dirpath, aligned_lengths_lists,
                         os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                           json_output_dirpath, genes_fpaths, operons_fpaths,
                           detailed_contigs_reports_dirpath,
                           os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                       os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths,
                        os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.meta)

    else:
        logger.main_info("")
        logger.notice(
            "Genes are not predicted by default. Use --gene-finding option to enable it."
        )
    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(
        output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info(
            'This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(
                    detailed_contigs_reports_dirpath,
                    'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            number_of_steps = sum([
                int(bool(value))
                for value in [contig_report_fpath_pattern, all_pdf_file]
            ])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(
                    '  1 of %d: Creating contig alignment plot...' %
                    number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths,
                    contig_report_fpath_pattern,
                    output_dirpath,
                    ref_fpath,
                    similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info(
                    '  %d of %d: Creating PDF with all tables and plots...' %
                    (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info('  Text versions of total report are saved to ' +
                     reports_fpaths)
    logger.main_info(
        '  Text versions of transposed total report are saved to ' +
        transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig,
                                     ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths,
                               plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig,
                                     ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info('  PDF version (tables and plots) saved to ' +
                         all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info('  Contig alignment plot: %s' %
                         contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0