def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each pairwise
    comparison.

    The value returned is whatever anim.process_deltadir() produces for the
    output directory.

    Raises ZeroDivisionError if anim.process_deltadir() raises it; diagnostic
    hints are logged first and the original exception is then re-raised.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    # Only the multiprocessing scheduler reports a cumulative return value;
    # None means "unknown" (NUCmer skipped, or jobs submitted through SGE).
    cumval = None
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(
            infiles, args.outdirname, nucmer_exe=args.nucmer_exe, maxmatch=args.maxmatch
        )
        if args.scheduler == "multiprocessing":
            logger.info("Running jobs with multiprocessing")
            cumval = run_mp.run_dependency_graph(joblist, verbose=args.verbose, logger=logger)
            logger.info("Cumulative return value: %d" % cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(joblist, verbose=args.verbose, logger=logger)
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        if not args.skip_nucmer:
            # cumval is None on the SGE path, where no return value is
            # available; guard the comparison so the handler itself cannot
            # fail before the real problem has been reported.
            if cumval is not None and 0 < cumval:
                logger.error("This is possibly due to NUCmer run failure, please investigate")
            else:
                logger.error(
                    "This is possibly due to a NUCmer comparison "
                    "being too distant for use. Please consider "
                    "using the --maxmatch option."
                )
        logger.error(last_exception())
        # There is no result to return: propagate the original error rather
        # than falling through to an unbound `data`.
        raise
    return data
Example #2
0
    def test_nucmer_job_generation(self):
        """Check the NUCmer/delta-filter job dependency tree.

        Six filter jobs are expected, each named with the "test" prefix and
        a zero-padded index, and each depending on exactly one NUCmer job
        that carries the matching index.
        """
        jobs = anim.generate_nucmer_jobs(self.files, jobprefix="test")
        assert_equal(len(jobs), 6)
        for index, filter_job in enumerate(jobs):
            # The filter job is the entry in the returned list
            assert_equal(filter_job.name, "test_%06d-f" % index)
            # It must wait on a single NUCmer job with the same index
            upstream = filter_job.dependencies
            assert_equal(len(upstream), 1)
            assert_equal(upstream[0].name, "test_%06d-n" % index)
def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each pairwise
    comparison.

    The value returned is whatever anim.process_deltadir() produces for the
    output directory.

    Raises ZeroDivisionError if anim.process_deltadir() raises it; diagnostic
    hints are logged first and the original exception is then re-raised.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    # Only the multiprocessing scheduler reports a cumulative return value;
    # None means "unknown" (NUCmer skipped, or jobs submitted through SGE).
    cumval = None
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(infiles,
                                            args.outdirname,
                                            nucmer_exe=args.nucmer_exe,
                                            maxmatch=args.maxmatch)
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            cumval = run_mp.run_dependency_graph(joblist,
                                                 verbose=args.verbose,
                                                 logger=logger)
            logger.info("Cumulative return value: %d" % cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. "
                               "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(joblist,
                                         verbose=args.verbose,
                                         logger=logger)
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        if not args.skip_nucmer:
            # cumval is None on the SGE path, where no return value is
            # available; guard the comparison so the handler itself cannot
            # fail before the real problem has been reported.
            if cumval is not None and 0 < cumval:
                logger.error("This is possibly due to NUCmer run failure, "
                             "please investigate")
            else:
                logger.error("This is possibly due to a NUCmer comparison "
                             "being too distant for use. Please consider "
                             "using the --maxmatch option.")
        logger.error(last_exception())
        # There is no result to return: propagate the original error rather
        # than falling through to an unbound `data`.
        raise
    return data
Example #4
0
def calculate_anim(infiles, org_lengths):
    """Run the ANIm analysis and return the processed NUCmer results.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    ANI is found by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless the command-line arguments ask for NUCmer to be skipped, a job
    list is generated for the input files and handed to either the
    multiprocessing or the SGE scheduler. The alignment subdirectory of the
    output directory is then passed to anim.process_deltadir(), and the
    object it returns is handed back to the caller.

    If the processed results flag a zero-identity error after a
    multiprocessing run, a hint is logged; when the scheduler also reported
    a positive cumulative return value the program exits with status 1.

    Unless compression is disabled, the alignment subdirectory is passed to
    compress_delete_outdir() before returning.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = os.path.join(args.outdirname, ALIGNDIR['ANIm'])
    logger.info("Writing nucmer output to %s", deltadir)

    # Schedule NUCmer runs (or honour the request to skip them)
    if args.skip_nucmer:
        logger.warning("Skipping NUCmer run (as instructed)!")
    else:
        nucmer_jobs = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            filter_exe=args.filter_exe,
            maxmatch=args.maxmatch,
            jobprefix=args.jobprefix,
        )
        if args.scheduler != 'multiprocessing':
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d", args.sgegroupsize)
            run_sge.run_dependency_graph(
                nucmer_jobs,
                logger=logger,
                jgprefix=args.jobprefix,
                sgegroupsize=args.sgegroupsize,
                sgeargs=args.sgeargs,
            )
        else:
            logger.info("Running jobs with multiprocessing")
            if args.workers is not None:
                logger.info("(using %d worker threads, if available)",
                            args.workers)
            else:
                logger.info("(using maximum number of available worker threads)")
            cumval = run_mp.run_dependency_graph(
                nucmer_jobs, workers=args.workers, logger=logger
            )
            logger.info("Cumulative return value: %d", cumval)
            if cumval > 0:
                logger.warning("At least one NUCmer comparison failed. ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    results = anim.process_deltadir(deltadir, org_lengths, logger=logger)

    # A zero percentage identity error is only diagnosed after a
    # multiprocessing run, because that is the only path that sets cumval.
    if (results.zero_error
            and not args.skip_nucmer
            and args.scheduler == 'multiprocessing'):
        if cumval > 0:
            logger.error("This has possibly been a NUCmer run failure, please investigate")
            logger.error(last_exception())
            sys.exit(1)
        logger.error(
            "This is possibly due to a NUCmer comparison "
            "being too distant for use. Please consider "
            "using the --maxmatch option."
        )
        logger.error(
            "This is alternatively due to NUCmer run "
            "failure, analysis will continue, but please "
            "investigate."
        )

    if not args.nocompress:
        logger.info("Compressing/deleting %s", deltadir)
        compress_delete_outdir(deltadir)

    # Hand back the processed data from the .delta files
    return results
def calculate_anim(args: Namespace, logger: Logger, infiles: List[Path],
                   org_lengths: Dict) -> pyani_tools.ANIResults:
    """Run the ANIm analysis and return the processed NUCmer results.

    :param args:  Namespace, command-line arguments
    :param logger: logging object
    :param infiles:  list of paths to each input file
    :param org_lengths:  dict, input sequence lengths, keyed by sequence

    ANI is found by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless ``args.skip_nucmer`` is set, a NUCmer job list is generated for
    the input files and run with either the multiprocessing or the SGE
    scheduler. The alignment subdirectory of the output directory is then
    passed to ``anim.process_deltadir()`` and the object it returns is
    handed back to the caller.

    If the processed results flag a zero-identity error after a
    multiprocessing run, a hint is logged; when the scheduler also reported
    a positive cumulative return value, ``SystemExit(1)`` is raised.

    Unless ``args.nocompress`` is set, the alignment subdirectory is passed
    to ``compress_delete_outdir()`` before returning.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = args.outdirname / ALIGNDIR["ANIm"]
    logger.info("Writing nucmer output to %s", deltadir)

    # Schedule NUCmer runs (or honour the request to skip them)
    if args.skip_nucmer:
        logger.warning("Skipping NUCmer run (as instructed)!")
    else:
        nucmer_jobs = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            filter_exe=args.filter_exe,
            maxmatch=args.maxmatch,
            jobprefix=args.jobprefix,
        )
        if args.scheduler != "multiprocessing":
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d", args.sgegroupsize)
            run_sge.run_dependency_graph(
                nucmer_jobs,
                logger=logger,
                jgprefix=args.jobprefix,
                sgegroupsize=args.sgegroupsize,
                sgeargs=args.sgeargs,
            )
        else:
            logger.info("Running jobs with multiprocessing")
            if args.workers is not None:
                logger.info("(using %d worker threads, if available)", args.workers)
            else:
                logger.info("(using maximum number of available worker threads)")
            cumval = run_mp.run_dependency_graph(
                nucmer_jobs, workers=args.workers, logger=logger
            )
            logger.info("Cumulative return value: %d", cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    results = anim.process_deltadir(deltadir, org_lengths, logger=logger)

    # A zero percentage identity error is only diagnosed after a
    # multiprocessing run, because that is the only path that sets cumval.
    if (
        results.zero_error
        and not args.skip_nucmer
        and args.scheduler == "multiprocessing"
    ):
        if 0 < cumval:
            logger.error(
                "This has possibly been a NUCmer run failure, please investigate",
                exc_info=True,
            )
            raise SystemExit(1)
        logger.error(
            "This is possibly due to:\n"
            "\t(i) a NUCmer comparison being too distant for use "
            "(please consider using the --maxmatch option)\n"
            "\t(ii) NUCmer run failure "
            "(analysis will continue, but please investigate)"
        )

    if not args.nocompress:
        logger.info("Compressing/deleting %s", deltadir)
        compress_delete_outdir(deltadir, logger)

    # Hand back the processed data from the .delta files
    return results
def calculate_anim(infiles, org_lengths):
    """Run the ANIm analysis and return the processed NUCmer results.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    ANI is found by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless the command-line arguments ask for NUCmer to be skipped, a job
    list is generated for the input files and handed to either the
    multiprocessing or the SGE scheduler. The alignment subdirectory of the
    output directory is then passed to anim.process_deltadir().

    The last element of the sequence returned by anim.process_deltadir() is
    treated as a zero-identity error flag; every element before it is
    returned to the caller as a tuple.

    If that flag is set after a multiprocessing run, a hint is logged; when
    the scheduler also reported a positive cumulative return value the
    program exits with status 1.

    Unless compression is disabled, the alignment subdirectory is passed to
    compress_delete_outdir() before returning.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = os.path.join(args.outdirname, ALIGNDIR['ANIm'])
    logger.info("Writing nucmer output to %s" % deltadir)

    # Schedule NUCmer runs (or honour the request to skip them)
    if args.skip_nucmer:
        logger.warning("Skipping NUCmer run (as instructed)!")
    else:
        nucmer_jobs = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            maxmatch=args.maxmatch,
            jobprefix=args.jobprefix,
        )
        if args.scheduler != 'multiprocessing':
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d" % args.sgegroupsize)
            run_sge.run_dependency_graph(
                nucmer_jobs,
                verbose=args.verbose,
                logger=logger,
                jgprefix=args.jobprefix,
                sgegroupsize=args.sgegroupsize,
            )
        else:
            logger.info("Running jobs with multiprocessing")
            if args.workers is not None:
                logger.info("(using %d worker threads, if available)" % args.workers)
            else:
                logger.info("(using maximum number of available worker threads)")
            cumval = run_mp.run_dependency_graph(
                nucmer_jobs,
                workers=args.workers,
                verbose=args.verbose,
                logger=logger,
            )
            logger.info("Cumulative return value: %d" % cumval)
            if cumval > 0:
                logger.warning("At least one NUCmer comparison failed. ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    data = anim.process_deltadir(deltadir, org_lengths, logger=logger)

    # The trailing element flags a zero percentage identity error. It is
    # only diagnosed after a multiprocessing run, because that is the only
    # path that sets cumval.
    if (data[-1]
            and not args.skip_nucmer
            and args.scheduler == 'multiprocessing'):
        if cumval > 0:
            logger.error("This has possibly been a NUCmer run failure, please investigate")
            logger.error(last_exception())
            sys.exit(1)
        logger.error(
            "This is possibly due to a NUCmer comparison "
            "being too distant for use. Please consider "
            "using the --maxmatch option."
        )
        logger.error(
            "This is alternatively due to NUCmer run "
            "failure, analysis will continue, but please "
            "investigate."
        )

    if not args.nocompress:
        logger.info("Compressing/deleting %s" % deltadir)
        compress_delete_outdir(deltadir)

    # Hand back everything except the trailing error flag
    return tuple(data[:-1])