Esempio n. 1
0
        elif opt in ("-m", "--meta"):
            pass
        elif opt in ["--no-plots"]:
            pass
        elif opt in ["--no-html"]:
            pass
        else:
            logger.error('Unknown option: %s. Use -h for help.' %
                         (opt + (' ' + arg) if arg else ''),
                         to_stderr=True)
            sys.exit(2)

    for c_fpath in contigs_fpaths:
        assert_file_exists(c_fpath, 'contigs')

    labels = quast.process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    for contigs_fpath in contigs_fpaths:
        quast_py_args.remove(contigs_fpath)

    # # Removing outout dir if exists
    # if output_dirpath:  # 'output dir was specified with -o option'
    #     if os.path.isdir(output_dirpath):
    #         shutil.rmtree(output_dirpath)

    # Directories
    output_dirpath, _, _ = quast._set_up_output_dir(output_dirpath,
                                                    None,
                                                    make_latest_symlink,
                                                    save_json=False,
                                                    remove_old=True)
Esempio n. 2
0
def main(args):
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    min_contig = qconfig.min_contig
    genes = []
    operons = []
    draw_plots = qconfig.draw_plots
    html_report = qconfig.html_report
    make_latest_symlink = True

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage(meta=True)
        sys.exit(2)

    quast_py_args = args[:]
    test_mode = False

    for opt, arg in options:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt == '--test':
            options.remove((opt, arg))
            quast_py_args.remove(opt)
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/meta_ref_1.fasta,'
                               'test_data/meta_ref_2.fasta,'
                               'test_data/meta_ref_3.fasta')]
            contigs_fpaths += ['test_data/meta_contigs_1.fasta',
                               'test_data/meta_contigs_2.fasta']
            test_mode = True

        elif opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden", meta=True)
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage(meta=True)
        sys.exit(2)

    ref_fpaths = []
    combined_ref_fpath = ''

    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            # Removing output dir arg in order to further
            # construct other quast calls from this options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args.remove(opt)
                quast_py_args.remove(arg)

            output_dirpath = os.path.abspath(arg)
            make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            assert_file_exists(arg, 'genes')
            genes += arg

        elif opt in ('-O', "--operons"):
            assert_file_exists(arg, 'operons')
            operons += arg

        elif opt in ('-R', "--reference"):
            # Removing reference args in order to further
            # construct quast calls from this args with other reference options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args.remove(opt)
                quast_py_args.remove(arg)

            ref_fpaths = arg.split(',')
            for i, ref_fpath in enumerate(ref_fpaths):
                assert_file_exists(ref_fpath, 'reference')
                ref_fpaths[i] = ref_fpath

        elif opt in ('-M', "--min-contig"):
            min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            pass

        elif opt in ('-l', '--labels'):
            quast_py_args.remove(opt)
            quast_py_args.remove(arg)
            labels = quast.parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            quast_py_args.remove(opt)
            all_labels_from_dirs = True

        elif opt in ('-j', '--save-json'):
            pass
        elif opt in ('-J', '--save-json-to'):
            pass
        elif opt in ('-t', "--contig-thresholds"):
            pass
        elif opt in ('-c', "--mincluster"):
            pass
        elif opt == "--est-ref-size":
            pass
        elif opt in ('-S', "--gene-thresholds"):
            pass
        elif opt in ('-s', "--scaffolds"):
            pass
        elif opt == "--gage":
            pass
        elif opt == "--debug":
            pass
        elif opt in ('-e', "--eukaryote"):
            pass
        elif opt in ('-f', "--gene-finding"):
            pass
        elif opt in ('-a', "--ambiguity-usage"):
            pass
        elif opt in ('-u', "--use-all-alignments"):
            pass
        elif opt in ('-n', "--strict-NA"):
            pass
        elif opt in ("-m", "--meta"):
            pass
        elif opt in ["--no-plots"]:
            pass
        elif opt in ["--no-html"]:
            pass
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for c_fpath in contigs_fpaths:
        assert_file_exists(c_fpath, 'contigs')

    labels = quast.process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    for contigs_fpath in contigs_fpaths:
        if contigs_fpath in quast_py_args:
            quast_py_args.remove(contigs_fpath)

    # Directories
    output_dirpath, _, _ = quast._set_up_output_dir(
        output_dirpath, None, make_latest_symlink,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    ########################################################################

    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    common_ref_fasta_ext = ''

    if ref_fpaths:
        logger.info()
        logger.info('Reference(s):')

        ref_fpaths, common_ref_fasta_ext, combined_ref_fpath =\
            _correct_refrences(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    assemblies = _correct_contigs(contigs_fpaths, corrected_dirpath, min_contig, labels)

    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.info()
        logger.notice('No references provided, starting quast.py with MetaGeneMark gene finder')
        _start_quast_main(
            None,
            quast_py_args,
            assemblies=assemblies,
            output_dirpath=os.path.join(output_dirpath, 'quast_output'),
            exit_on_exception=True)
        exit(0)

    # Running combined reference
    run_name = 'for the combined reference'
    logger.info()
    logger.info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)

    return_code, total_num_notifications = _start_quast_main(run_name, quast_py_args,
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=os.path.join(output_dirpath, 'combined_quast_output'),
        num_notifications_tuple=total_num_notifications)

    # Partitioning contigs into bins aligned to each reference
    assemblies_by_reference, not_aligned_assemblies = _partition_contigs(
        assemblies, ref_fpaths, corrected_dirpath,
        os.path.join(output_dirpath, 'combined_quast_output', 'contigs_reports', 'alignments_%s.tsv'))

    for ref_name, ref_assemblies in assemblies_by_reference.iteritems():
        logger.info('')
        if not ref_assemblies:
            logger.info('No contigs were aligned to the reference ' + ref_name)
        else:
            run_name = 'for the contigs aligned to ' + ref_name
            logger.info('Starting quast.py ' + run_name)

            return_code, total_num_notifications = _start_quast_main(run_name, quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=os.path.join(corrected_dirpath, ref_name) + common_ref_fasta_ext,
                output_dirpath=os.path.join(output_dirpath, ref_name + '_quast_output'),
                exit_on_exception=False, num_notifications_tuple=total_num_notifications)

    # Finally running for the contigs that has not been aligned to any reference
    run_name = 'for the contigs not alined anywhere'
    logger.info()
    logger.info('Starting quast.py ' + run_name + '...')

    return_code, total_num_notifications = _start_quast_main(run_name, quast_py_args,
        assemblies=not_aligned_assemblies,
        output_dirpath=os.path.join(output_dirpath, 'not_aligned_quast_output'),
        exit_on_exception=False, num_notifications_tuple=total_num_notifications)

    if return_code not in [0, 4]:
        logger.error('Error running quast.py for the contigs not aligned anywhere')

    quast._cleanup(corrected_dirpath)

    logger.info('')
    logger.info('MetaQUAST finished.')
    logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
Esempio n. 3
0
def main(args):
    if ' ' in qconfig.QUAST_HOME:
        logger.error(
            'QUAST does not support spaces in paths. \n'
            'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
            'Please, put QUAST in a different directory, then try again.\n',
            to_stderr=True,
            exit_with_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    genes = []
    operons = []
    html_report = qconfig.html_report
    make_latest_symlink = True
    ref_txt_fpath = None

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args,
                                                    qconfig.short_options,
                                                    qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage(meta=True)
        sys.exit(2)

    quast_py_args = args[:]
    test_mode = False

    for opt, arg in options:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt == '--test' or opt == '--test-no-ref':
            options.remove((opt, arg))
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            options += [('-o', 'quast_test_output')]
            if opt == '--test':
                options += [('-R', ','.join([
                    os.path.join(qconfig.QUAST_HOME, 'test_data',
                                 'meta_ref_1.fasta'),
                    os.path.join(qconfig.QUAST_HOME, 'test_data',
                                 'meta_ref_2.fasta'),
                    os.path.join(qconfig.QUAST_HOME, 'test_data',
                                 'meta_ref_3.fasta')
                ]))]
            contigs_fpaths += [
                os.path.join(qconfig.QUAST_HOME, 'test_data',
                             'meta_contigs_1.fasta'),
                os.path.join(qconfig.QUAST_HOME, 'test_data',
                             'meta_contigs_2.fasta')
            ]
            test_mode = True

        elif opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", meta=True, short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version(meta=True)
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage(meta=True)
        sys.exit(2)

    ref_fpaths = []
    combined_ref_fpath = ''
    reads_fpath_f = ''
    reads_fpath_r = ''
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            # Removing output dir arg in order to further
            # construct other quast calls from this options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(
                    quast_py_args, opt, arg)

            output_dirpath = os.path.abspath(arg)
            make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            assert_file_exists(arg, 'genes')
            genes += arg

        elif opt in ('-O', "--operons"):
            assert_file_exists(arg, 'operons')
            operons += arg

        elif opt in ('-R', "--reference"):
            # Removing reference args in order to further
            # construct quast calls from this args with other reference options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(
                    quast_py_args, opt, arg)
            if os.path.isdir(arg):
                ref_fpaths = [
                    os.path.join(path, file)
                    for (path, dirs, files) in os.walk(arg) for file in files
                    if qutils.check_is_fasta_file(file)
                ]
                ref_fpaths.sort()
            else:
                ref_fpaths = arg.split(',')
                for i, ref_fpath in enumerate(ref_fpaths):
                    assert_file_exists(ref_fpath, 'reference')
                    ref_fpaths[i] = ref_fpath

        elif opt == '--max-ref-number':
            quast_py_args = __remove_from_quast_py_args(
                quast_py_args, opt, arg)
            qconfig.max_references = int(arg)
            if qconfig.max_references < 0:
                qconfig.max_references = 0

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-l', '--labels'):
            quast_py_args = __remove_from_quast_py_args(
                quast_py_args, opt, arg)
            labels = quast.parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            all_labels_from_dirs = True

        elif opt in ('-j', '--save-json'):
            pass
        elif opt in ('-J', '--save-json-to'):
            pass
        elif opt == "--contig-thresholds":
            pass
        elif opt in ('-c', "--mincluster"):
            pass
        elif opt == "--est-ref-size":
            pass
        elif opt == "--gene-thresholds":
            pass
        elif opt in ('-s', "--scaffolds"):
            pass
        elif opt == "--gage":
            pass
        elif opt == "--debug":
            pass
        elif opt in ('-e', "--eukaryote"):
            pass
        elif opt in ('-f', "--gene-finding"):
            pass
        elif opt in ('-i', "--min-alignment"):
            pass
        elif opt in ('-c', "--min-cluster"):
            pass
        elif opt in ('-a', "--ambiguity-usage"):
            pass
        elif opt in ('-u', "--use-all-alignments"):
            pass
        elif opt == "--strict-NA":
            pass
        elif opt in ('-x', "--extensive-mis-size"):
            pass
        elif opt == "--meta":
            pass
        elif opt == '--references-list':
            ref_txt_fpath = arg
        elif opt == '--glimmer':
            pass
        elif opt == '--no-snps':
            pass
        elif opt == '--no-check':
            pass
        elif opt == '--no-gc':
            pass
        elif opt == '--no-plots':
            pass
        elif opt == '--no-html':
            html_report = False
        elif opt == '--fast':  # --no-check, --no-gc, --no-snps will automatically set in QUAST runs
            html_report = False
        elif opt == '--plots-format':
            pass
        elif opt == '--memory-efficient':
            pass
        elif opt == '--silent':
            qconfig.silent = True
        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
            quast_py_args = __remove_from_quast_py_args(
                quast_py_args, opt, arg)
        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
            quast_py_args = __remove_from_quast_py_args(
                quast_py_args, opt, arg)
        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' %
                         (opt + ' ' + arg),
                         to_stderr=True,
                         exit_with_code=2)

    for c_fpath in contigs_fpaths:
        assert_file_exists(c_fpath, 'contigs')

    labels = quast.process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    for contigs_fpath in contigs_fpaths:
        if contigs_fpath in quast_py_args:
            quast_py_args.remove(contigs_fpath)

    # Directories
    output_dirpath, _, _ = quast._set_up_output_dir(output_dirpath,
                                                    None,
                                                    make_latest_symlink,
                                                    save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None)
    logger.start()

    qconfig.set_max_threads(logger)

    ########################################################################

    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES

    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            _correct_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    assemblies, correct_assemblies = _correct_contigs(contigs_fpaths,
                                                      output_dirpath, labels)
    if not assemblies:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold."
        )
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']
    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice(
                "Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled"
            )
        else:
            if ref_txt_fpath:
                logger.main_info(
                    "List of references was provided, starting to download reference genomes from NCBI..."
                )
            else:
                logger.main_info(
                    "No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                    "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath,
                                              qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            ref_fpaths = search_references_meta.do(assemblies, labels,
                                                   downloaded_dirpath,
                                                   ref_txt_fpath)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not ref_txt_fpath:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    _correct_references(ref_fpaths, corrected_dirpath)
            elif test_mode and ref_fpaths is None:
                logger.error(
                    'Failed to download or setup SILVA 16S rRNA database for working without '
                    'references on metagenome datasets!',
                    to_stderr=True,
                    exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice(
            'No references are provided, starting regular QUAST with MetaGeneMark gene finder'
        )
        _start_quast_main(None,
                          quast_py_args,
                          assemblies=assemblies,
                          output_dirpath=output_dirpath,
                          exit_on_exception=True)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath,
                                           qconfig.combined_output_name)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(combined_ref_fpath,
                                      contigs_fpaths,
                                      reads_fpaths,
                                      corrected_ref_fpaths,
                                      os.path.join(combined_output_dirpath,
                                                   qconfig.variation_dirname),
                                      external_logger=logger)
        if bed_fpath:
            quast_py_args += ['--bed-file']
            quast_py_args += [bed_fpath]

    quast_py_args += ['--combined-ref']
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings,
                               total_num_nf_errors)
    if qconfig.html_report:
        from libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    return_code, total_num_notifications, assemblies, labels = _start_quast_main(
        run_name,
        quast_py_args + ["--ambiguity-usage"] + ['all'],
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=combined_output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_first_run=True)
    for arg in args:
        if arg in ('-s', "--scaffolds"):
            quast_py_args.remove(arg)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath,
                                       qconfig.combined_output_name,
                                       'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        logger.main_info(
            'Failed aligning the contigs for all the references. ' +
            ('Try to restart MetaQUAST with another references.'
             if not downloaded_refs else
             'Try to use option --max-ref-number to change maximum number of references '
             '(per each assembly) to download.'))
        logger.main_info('')
        quast._cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        logger.finish_up(numbers=tuple(total_num_notifications),
                         check_test=test_mode)
        return

    if downloaded_refs:
        logger.main_info()
        logger.main_info(
            'Excluding downloaded references with low genome fraction from further analysis..'
        )
        corr_ref_fpaths = remove_unaligned_downloaded_refs(
            genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = {}
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    _correct_references(corr_ref_fpaths, corrected_dirpath)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications, assemblies, labels = _start_quast_main(
                run_name,
                quast_py_args + ["--ambiguity-usage"] + ['all'],
                assemblies=assemblies,
                reference_fpath=combined_ref_fpath,
                output_dirpath=combined_output_dirpath,
                num_notifications_tuple=total_num_notifications,
                is_first_run=True)
            if json_texts is not None:
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info(
                'All downloaded references have genome fraction more than 10%. Nothing was excluded.'
            )
        else:
            logger.main_info(
                'All downloaded references have low genome fraction. Nothing was excluded for now.'
            )

    quast_py_args += ['--no-check-meta']
    qconfig.contig_thresholds = ','.join([
        str(threshold) for threshold in qconfig.contig_thresholds
        if threshold > qconfig.min_contig
    ])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args = __remove_from_quast_py_args(quast_py_args,
                                                '--contig-thresholds',
                                                qconfig.contig_thresholds)
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]
    quast_py_args.remove('--combined-ref')

    logger.main_info()
    logger.main_info(
        'Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = _partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports',
                     'alignments_%s.tsv'), labels)

    ref_names = []
    output_dirpath_per_ref = os.path.join(output_dirpath,
                                          qconfig.per_ref_dirname)
    for ref_fpath, ref_assemblies in assemblies_by_reference:
        ref_name = qutils.name_from_fpath(ref_fpath)
        logger.main_info('')
        if not ref_assemblies:
            logger.main_info('No contigs were aligned to the reference ' +
                             ref_name + ', skipping..')
        else:
            ref_names.append(ref_name)
            run_name = 'for the contigs aligned to ' + ref_name
            logger.main_info('Starting quast.py ' + run_name)

            return_code, total_num_notifications = _start_quast_main(
                run_name,
                quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=ref_fpath,
                output_dirpath=os.path.join(output_dirpath_per_ref, ref_name),
                exit_on_exception=False,
                num_notifications_tuple=total_num_notifications)
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(
                assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name +
                         ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '...')

        return_code, total_num_notifications = _start_quast_main(
            run_name,
            quast_py_args,
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath,
                                        qconfig.not_aligned_name),
            exit_on_exception=False,
            num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error(
                'Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath,
                                              qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(
                output_dirpath)
        else:
            html_summary_report_fpath = None
        from libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembl_metrics = [
            reporting.Fields.MIS_RELOCATION,
            reporting.Fields.MIS_TRANSLOCATION, reporting.Fields.MIS_INVERTION,
            reporting.Fields.MIS_ISTRANSLOCATIONS
        ]
        create_meta_summary.do(
            html_summary_report_fpath, summary_output_dirpath,
            combined_output_dirpath, output_dirpath_per_ref, metrics_for_plots,
            misassembl_metrics,
            ref_names if no_unaligned_contigs else ref_names +
            [qconfig.not_aligned_name])
        if html_report and json_texts:
            from libs import plotter
            html_saver.save_colors(output_dirpath,
                                   contigs_fpaths,
                                   plotter.dict_color_and_ls,
                                   meta=True)
            html_saver.create_meta_report(output_dirpath, json_texts)

    quast._cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    logger.finish_up(numbers=tuple(total_num_notifications),
                     check_test=test_mode)
Esempio n. 4
0
def main(args):
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    genes = []
    operons = []
    html_report = qconfig.html_report
    make_latest_symlink = True
    ref_txt_fpath = None

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage(meta=True)
        sys.exit(2)

    quast_py_args = args[:]
    test_mode = False

    for opt, arg in options:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt == '--test' or opt == '--test-no-ref':
            options.remove((opt, arg))
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            options += [('-o', 'quast_test_output')]
            if opt == '--test':
                options += [('-R', ','.join([os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_1.fasta'),
                            os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_2.fasta'),
                            os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_3.fasta')]))]
            contigs_fpaths += [os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_contigs_1.fasta'),
                               os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_contigs_2.fasta')]
            test_mode = True

        elif opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", meta=True, short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version(meta=True)
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage(meta=True)
        sys.exit(2)

    ref_fpaths = []
    combined_ref_fpath = ''
    reads_fpath_f = ''
    reads_fpath_r = ''
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            # Removing output dir arg in order to further
            # construct other quast calls from this options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)

            output_dirpath = os.path.abspath(arg)
            make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            assert_file_exists(arg, 'genes')
            genes += arg

        elif opt in ('-O', "--operons"):
            assert_file_exists(arg, 'operons')
            operons += arg

        elif opt in ('-R', "--reference"):
            # Removing reference args in order to further
            # construct quast calls from this args with other reference options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            if os.path.isdir(arg):
                ref_fpaths = [os.path.join(path,file) for (path, dirs, files) in os.walk(arg) for file in files if qutils.check_is_fasta_file(file)]
                ref_fpaths.sort()
            else:
                ref_fpaths = arg.split(',')
                for i, ref_fpath in enumerate(ref_fpaths):
                    assert_file_exists(ref_fpath, 'reference')
                    ref_fpaths[i] = ref_fpath

        elif opt == '--max-ref-number':
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            qconfig.max_references = int(arg)
            if qconfig.max_references < 0:
                qconfig.max_references = 0

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-l', '--labels'):
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            labels = quast.parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            all_labels_from_dirs = True

        elif opt in ('-j', '--save-json'):
            pass
        elif opt in ('-J', '--save-json-to'):
            pass
        elif opt == "--contig-thresholds":
            pass
        elif opt in ('-c', "--mincluster"):
            pass
        elif opt == "--est-ref-size":
            pass
        elif opt == "--gene-thresholds":
            pass
        elif opt in ('-s', "--scaffolds"):
            pass
        elif opt == "--gage":
            pass
        elif opt == "--debug":
            pass
        elif opt in ('-e', "--eukaryote"):
            pass
        elif opt in ('-f', "--gene-finding"):
            pass
        elif opt in ('-i', "--min-alignment"):
            pass
        elif opt in ('-c', "--min-cluster"):
            pass
        elif opt in ('-a', "--ambiguity-usage"):
            pass
        elif opt in ('-u', "--use-all-alignments"):
            pass
        elif opt == "--strict-NA":
            pass
        elif opt in ('-x', "--extensive-mis-size"):
            pass
        elif opt == "--meta":
            pass
        elif opt == '--references-list':
            ref_txt_fpath = arg
        elif opt == '--glimmer':
            pass
        elif opt == '--no-snps':
            pass
        elif opt == '--no-check':
            pass
        elif opt == '--no-gc':
            pass
        elif opt == '--no-plots':
            pass
        elif opt == '--no-html':
            html_report = False
        elif opt == '--fast':  # --no-check, --no-gc, --no-snps will automatically set in QUAST runs
            html_report = False
        elif opt == '--plots-format':
            pass
        elif opt == '--memory-efficient':
            pass
        elif opt == '--silent':
            qconfig.silent = True
        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True
        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for c_fpath in contigs_fpaths:
        assert_file_exists(c_fpath, 'contigs')

    labels = quast.process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    for contigs_fpath in contigs_fpaths:
        if contigs_fpath in quast_py_args:
            quast_py_args.remove(contigs_fpath)

    # Directories
    output_dirpath, _, _ = quast._set_up_output_dir(
        output_dirpath, None, make_latest_symlink,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    args = [os.path.realpath(__file__)]
    for k, v in options: args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None)
    logger.start()

    qconfig.set_max_threads(logger)

    ########################################################################

    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES

    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            _correct_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    assemblies, correct_assemblies = _correct_contigs(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']
    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if ref_txt_fpath:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                        "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, ref_txt_fpath)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not ref_txt_fpath:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    _correct_references(ref_fpaths, corrected_dirpath)
            elif test_mode and ref_fpaths is None:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        _start_quast_main(
            None,
            quast_py_args,
            assemblies=assemblies,
            output_dirpath=output_dirpath,
            exit_on_exception=True)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(combined_ref_fpath, contigs_fpaths, reads_fpaths, corrected_ref_fpaths,
                                      os.path.join(combined_output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)
        if bed_fpath:
            quast_py_args += ['--bed-file']
            quast_py_args += [bed_fpath]

    quast_py_args += ['--combined-ref']
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    return_code, total_num_notifications, assemblies, labels = _start_quast_main(run_name, quast_py_args + ["--ambiguity-usage"] + ['all'],
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=combined_output_dirpath,
        num_notifications_tuple=total_num_notifications, is_first_run=True)
    for arg in args:
        if arg in ('-s', "--scaffolds"):
            quast_py_args.remove(arg)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        logger.main_info('Failed aligning the contigs for all the references. ' + ('Try to restart MetaQUAST with another references.'
                                                        if not downloaded_refs else 'Try to use option --max-ref-number to change maximum number of references '
                                                                                    '(per each assembly) to download.'))
        logger.main_info('')
        quast._cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
        return

    if downloaded_refs:
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = remove_unaligned_downloaded_refs(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = {}
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    _correct_references(corr_ref_fpaths, corrected_dirpath)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications, assemblies, labels = _start_quast_main(run_name, quast_py_args + ["--ambiguity-usage"] + ['all'],
                assemblies=assemblies,
                reference_fpath=combined_ref_fpath,
                output_dirpath=combined_output_dirpath,
                num_notifications_tuple=total_num_notifications, is_first_run=True)
            if json_texts is not None:
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    quast_py_args += ['--no-check-meta']
    qconfig.contig_thresholds = ','.join([str(threshold) for threshold in qconfig.contig_thresholds if threshold > qconfig.min_contig])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args = __remove_from_quast_py_args(quast_py_args, '--contig-thresholds', qconfig.contig_thresholds)
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]
    quast_py_args.remove('--combined-ref')

    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = _partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'), labels)

    ref_names = []
    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    for ref_fpath, ref_assemblies in assemblies_by_reference:
        ref_name = qutils.name_from_fpath(ref_fpath)
        logger.main_info('')
        if not ref_assemblies:
            logger.main_info('No contigs were aligned to the reference ' + ref_name + ', skipping..')
        else:
            ref_names.append(ref_name)
            run_name = 'for the contigs aligned to ' + ref_name
            logger.main_info('Starting quast.py ' + run_name)

            return_code, total_num_notifications = _start_quast_main(run_name, quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=ref_fpath,
                output_dirpath=os.path.join(output_dirpath_per_ref, ref_name),
                exit_on_exception=False, num_notifications_tuple=total_num_notifications)
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '...')

        return_code, total_num_notifications = _start_quast_main(run_name, quast_py_args,
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            exit_on_exception=False, num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembl_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION, reporting.Fields.MIS_INVERTION,
                           reporting.Fields.MIS_ISTRANSLOCATIONS]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath, output_dirpath_per_ref, metrics_for_plots, misassembl_metrics,
                               ref_names if no_unaligned_contigs else ref_names + [qconfig.not_aligned_name])
        if html_report and json_texts:
            from libs import plotter
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls, meta=True)
            html_saver.create_meta_report(output_dirpath, json_texts)

    quast._cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)