Example 1
def test_aniblastall_concordance():
    """Test concordance of ANIblastall method with JSpecies output."""
    # Make/check output directory
    mode = "ANIblastall"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    aniblastall_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIblastall concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_fasta_files(infiles, outdirname,
                                                       pyani_config.FRAGSIZE)

    # Build jobgraph
    jobgraph = anib.make_job_graph(
        infiles, fragfiles,
        anib.make_blastcmd_builder("ANIblastall", outdirname))
    print("\nJobgraph:\n", jobgraph)
    print("\nJob 0:\n", jobgraph[0].script)

    # Run jobgraph with multiprocessing
    run_dependency_graph(jobgraph)
    print("Ran multiprocessing jobs")

    # Process BLAST; the pid data is in anib_data[1]
    aniblastall_data = anib.process_blast(outdirname,
                                          org_lengths,
                                          fraglengths,
                                          mode="ANIblastall")
    aniblastall_pid = (aniblastall_data.percentage_identity
                       .sort_index(axis=0).sort_index(axis=1) * 100.0)

    index, columns = aniblastall_pid.index, aniblastall_pid.columns
    diffmat = aniblastall_pid.values - aniblastall_jspecies.values  # as_matrix() was removed in pandas 1.0
    aniblastall_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    aniblastall_pid.to_csv(os.path.join(outdirname, 'ANIblastall_pid.tab'),
                           sep='\t')
    aniblastall_jspecies.to_csv(os.path.join(outdirname,
                                             'ANIblastall_jspecies.tab'),
                                sep='\t')
    aniblastall_diff.to_csv(os.path.join(outdirname, 'ANIblastall_diff.tab'),
                            sep='\t')
    print("ANIblastall concordance test output placed in %s" % outdirname)
    print("ANIblastall PID:\n", aniblastall_pid)
    print("ANIblastall JSpecies:\n", aniblastall_jspecies)
    print("ANIblastall diff:\n", aniblastall_diff)

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = aniblastall_diff.abs().values.max()
    print("Maximum difference for ANIblastall: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
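
Note: this example predates pandas 1.0, which removed DataFrame.as_matrix(); the difference matrix above is therefore computed with .values instead. A minimal, self-contained sketch of the same max-difference check, using hypothetical 2x2 identity matrices in place of the pyani and JSpecies output:

import pandas as pd

# Hypothetical percentage-identity matrices; labels and values are illustrative
result = pd.DataFrame([[100.0, 87.3], [87.1, 100.0]], index=list("AB"), columns=list("AB"))
target = pd.DataFrame([[100.0, 87.4], [87.2, 100.0]], index=list("AB"), columns=list("AB"))

# .values replaces the removed DataFrame.as_matrix()
diff = pd.DataFrame(result.values - target.values, index=result.index, columns=result.columns)
assert diff.abs().values.max() < 0.5  # tolerance standing in for ANIB_THRESHOLD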
Example 2
def run_anim_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
    """Pass ANIm nucmer jobs to the scheduler.

    :param joblist:           list of ComparisonJob namedtuples
    :param args:              command-line arguments for the run
    """
    logger = logging.getLogger(__name__)

    if args.scheduler == "multiprocessing":
        logger.info("Running jobs with multiprocessing")
        if not args.workers:
            logger.debug("(using maximum number of worker threads)")
        else:
            logger.debug("(using %d worker threads, if available)", args.workers)
        cumval = run_mp.run_dependency_graph(
            [_.job for _ in joblist], workers=args.workers
        )
        if cumval > 0:
            logger.error(
                "At least one NUCmer comparison failed. Please investigate (exiting)"
            )
            raise PyaniException("Multiprocessing run failed in ANIm")
        logger.info("Multiprocessing run completed without error")
    else:
        logger.info("Running jobs with SGE")
        logger.debug("Setting jobarray group size to %d", args.sgegroupsize)
        run_sge.run_dependency_graph(
            [_.job for _ in joblist],
            jgprefix=args.jobprefix,
            sgegroupsize=args.sgegroupsize,
            sgeargs=args.sgeargs,
        )
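
A sketch of how a caller might satisfy this interface; the ComparisonJob namedtuple below is a hypothetical stand-in (only its job field is read by run_anim_jobs, and the real pyani type carries more information):

from argparse import Namespace
from collections import namedtuple

ComparisonJob = namedtuple("ComparisonJob", "query subject job")  # hypothetical stand-in

args = Namespace(scheduler="multiprocessing", workers=4, jobprefix="ANIm",
                 sgegroupsize=10000, sgeargs=None)
# run_anim_jobs([ComparisonJob("genome_1", "genome_2", nucmer_job)], args)
# where nucmer_job is a pyani Job wrapping a NUCmer command line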
Example 3
def test_aniblastall_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    fragment_length,
    tmp_path,
):
    """Check ANIblastall results are concordant with JSpecies."""
    # Get lengths of input genomes
    orglengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Perform ANIblastall on the input directory contents
    fragfiles, fraglengths = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    jobgraph = anib.make_job_graph(
        paths_concordance_fna,
        fragfiles,
        anib.make_blastcmd_builder("ANIblastall", tmp_path),
    )
    assert 0 == run_mp.run_dependency_graph(jobgraph)  # Jobs must run correctly

    # Process BLAST output
    result_pid = anib.process_blast(
        tmp_path, orglengths, fraglengths, mode="ANIblastall"
    ).percentage_identity

    # Compare JSpecies output to results
    result_pid = (result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0).values
    tgt_pid = parse_jspecies(path_concordance_jspecies)["ANIb"].values
    assert result_pid - tgt_pid == pytest.approx(0, abs=tolerance_anib_hi)
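
The final assertion relies on pytest.approx accepting array-valued comparisons when given an absolute tolerance; a minimal illustration with synthetic deltas:

import numpy as np
import pytest

deltas = np.array([[0.0, 0.02], [-0.03, 0.0]])  # synthetic result - target values
assert deltas == pytest.approx(0, abs=0.1)      # passes: every element within +/-0.1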
Example 4
    def test_aniblastall_concordance(self):
        """ANIblastall results concordant with JSpecies."""
        # Perform ANIblastall on the input directory contents
        outdir = os.path.join(self.outdir, "blastall")
        os.makedirs(outdir, exist_ok=True)
        fragfiles, fraglengths = anib.fragment_fasta_files(
            self.infiles, outdir, self.fragsize
        )
        jobgraph = anib.make_job_graph(
            self.infiles, fragfiles, anib.make_blastcmd_builder("ANIblastall", outdir)
        )
        assert_equal(0, run_mp.run_dependency_graph(jobgraph))
        results = anib.process_blast(
            outdir, self.orglengths, fraglengths, mode="ANIblastall"
        )
        result_pid = results.percentage_identity
        result_pid.to_csv(os.path.join(self.outdir, "pyani_aniblastall.tab"), sep="\t")

        # Compare JSpecies output to results
        result_pid = result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
        diffmat = result_pid.values - self.target["ANIb"].values
        aniblastall_diff = pd.DataFrame(
            diffmat, index=result_pid.index, columns=result_pid.columns
        )
        aniblastall_diff.to_csv(
            os.path.join(self.outdir, "pyani_aniblastall_diff.tab"), sep="\t"
        )
        assert_less(aniblastall_diff.abs().values.max(), self.tolerance["ANIblastall"])
Example 5
    def test_dependency_graph_run(self):
        """Test that module runs dependency graph."""
        fragresult = anib.fragment_fasta_files(self.infiles, self.outdir, self.fraglen)
        blastcmds = anib.make_blastcmd_builder("ANIb", self.outdir)
        jobgraph = anib.make_job_graph(self.infiles, fragresult[0], blastcmds)
        result = run_multiprocessing.run_dependency_graph(jobgraph)
        self.assertEqual(0, result)
Example 6
    def test_dependency_graph_run(self):
        """Module runs dependency graph."""
        fragresult = anib.fragment_fasta_files(self.infiles, self.outdir, self.fraglen)
        blastcmds = anib.make_blastcmd_builder("ANIb", self.outdir)
        jobgraph = anib.make_job_graph(self.infiles, fragresult[0], blastcmds)
        result = run_multiprocessing.run_dependency_graph(jobgraph)
        assert_equal(0, result)
Example 7
def test_dependency_graph_run(path_fna_two, fragment_length, tmp_path):
    """Test that module runs dependency graph."""
    fragresult = fragment_fasta_files(path_fna_two, tmp_path, fragment_length)
    blastcmds = make_blastcmd_builder("ANIb", tmp_path)
    jobgraph = make_job_graph(path_fna_two, fragresult[0], blastcmds)
    result = run_dependency_graph(jobgraph)
    assert 0 == result
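
run_dependency_graph returns the sum of the jobs' subprocess exit codes, so 0 means every job ran cleanly. A toy graph, assuming pyani's Job interface (a name plus a shell command, with add_dependency to order jobs):

from pyani import pyani_jobs
from pyani import run_multiprocessing as run_mp

db_job = pyani_jobs.Job("makedb", "echo building database")      # runs first
blast_job = pyani_jobs.Job("blastn", "echo running comparison")  # runs after db_job
blast_job.add_dependency(db_job)

# Dependencies are discovered from the jobs passed in, as with make_job_graph output
assert run_mp.run_dependency_graph([blast_job]) == 0  # 0 => all jobs succeeded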
Example 8
def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each pairwise
    comparison.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(
            infiles, args.outdirname, nucmer_exe=args.nucmer_exe, maxmatch=args.maxmatch
        )
        if args.scheduler == "multiprocessing":
            logger.info("Running jobs with multiprocessing")
            cumval = run_mp.run_dependency_graph(joblist, verbose=args.verbose, logger=logger)
            logger.info("Cumulative return value: %d" % cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. " + "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(joblist, verbose=args.verbose, logger=logger)
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        if not args.skip_nucmer:
            if 0 < cumval:
                logger.error("This is possibly due to NUCmer run failure, " + "please investigate")
            else:
                logger.error(
                    "This is possibly due to a NUCmer comparison "
                    + "being too distant for use. Please consider "
                    + "using the --maxmatch option."
                )
        logger.error(last_exception())
    return data
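
The matrices described in the docstring follow directly from the parsed .delta totals; a back-of-envelope sketch with hypothetical numbers:

aln_length = 1_000_000   # hypothetical total aligned length between two genomes (bp)
sim_errors = 13_500      # hypothetical total similarity errors in those alignments
genome_length = 1_250_000

ani = 1 - sim_errors / aln_length      # average nucleotide identity
coverage = aln_length / genome_length  # aligned fraction of the whole genome
print(f"ANIm: {ani:.4f}, coverage: {coverage:.4f}")  # ANIm: 0.9865, coverage: 0.8000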
Example 9
    def test_aniblastall_concordance(self):
        """Check ANIblastall results are concordant with JSpecies."""
        # Perform ANIblastall on the input directory contents
        outdir = self.outdir / "blastall"
        outdir.mkdir(exist_ok=True)
        fragfiles, fraglengths = anib.fragment_fasta_files(
            self.infiles, outdir, self.fragsize)
        jobgraph = anib.make_job_graph(
            self.infiles, fragfiles,
            anib.make_blastcmd_builder("ANIblastall", outdir))
        self.assertEqual(0, run_mp.run_dependency_graph(jobgraph))
        results = anib.process_blast(outdir,
                                     self.orglengths,
                                     fraglengths,
                                     mode="ANIblastall")
        result_pid = results.percentage_identity
        result_pid.to_csv(self.outdir / "pyani_aniblastall.tab", sep="\t")

        # Compare JSpecies output to results
        result_pid = result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
        diffmat = result_pid.values - self.target["ANIb"].values
        aniblastall_diff = pd.DataFrame(diffmat,
                                        index=result_pid.index,
                                        columns=result_pid.columns)
        aniblastall_diff.to_csv(self.outdir / "pyani_aniblastall_diff.tab",
                                sep="\t")
        self.assertLess(aniblastall_diff.abs().values.max(),
                        self.tolerance["ANIblastall"])
Example 10
def unified_anib(indirname, User_ID):
    # Build BLAST databases and run pairwise BLASTN
    # Fraglengths does not get reused with BLASTN
    os.mkdir(indirname+'{0}_out/'.format(User_ID))
    os.system("chmod 777 {0}".format(indirname+'{0}_out'.format(User_ID)))
    logging.basicConfig(level=logging.DEBUG, filename="/home/linproject/Workspace/LIN_log/logfile_{0}".format(User_ID),
                        filemode="a+", format="%(asctime)-15s %(levelname)-8s %(message)s")
    infiles = pyani_files.get_fasta_files(indirname)
    org_lengths = pyani_files.get_sequence_lengths(infiles)
    fragsize = pyani_config.FRAGSIZE
    filestems = pyani_config.ANIB_FILESTEMS
    filenames = os.listdir(indirname)
    for fname in filenames:
        if ' ' in os.path.abspath(fname):
            logging.error("File or directory '%s' contains whitespace" % fname)
            logging.error("This will cause issues with MUMmer and BLAST")
            logging.error("(exiting)")
            sys.exit(1)
    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, indirname+'{0}_out/'.format(User_ID), fragsize)
    # Export fragment lengths as JSON, in case we re-run BLASTALL with
    # --skip_blastn
    with open(os.path.join(indirname+'{0}_out/'.format(User_ID), 'fraglengths.json'), 'w') as outfile:
        json.dump(fraglengths, outfile)
    # Which executables are we using?
    format_exe = pyani_config.FORMATDB_DEFAULT
    blast_exe = pyani_config.BLASTALL_DEFAULT
    # Run BLAST database-building and executables from a jobgraph
    logging.info("Creating job dependency graph")
    jobgraph = anib.make_job_graph(infiles, fragfiles, indirname+'{0}_out/'.format(User_ID), format_exe, blast_exe, 'ANIblastall')

    logging.info("Running jobs with multiprocessing")
    logging.info("Running job dependency graph")
    cumval = run_mp.run_dependency_graph(jobgraph, verbose=False,
                                         logger=logging)
    if 0 < cumval:
        logging.warning("At least one BLAST run failed. " +
                       "%s may fail." % 'ANIblastall')
    else:
        logging.info("All multiprocessing jobs complete.")

    # Process pairwise BLASTN output
    logging.info("Processing pairwise %s BLAST output." % 'ANIblastall')
    try:
        data = anib.process_blast(indirname+'{0}_out/'.format(User_ID), org_lengths,
                                  fraglengths=fraglengths, mode='ANIblastall')
    except ZeroDivisionError:
        logging.error("One or more BLAST output files has a problem.")
        if 0 < cumval:
            logging.error("This is possibly due to BLASTN run failure, " +
                          "please investigate")
        else:
            logging.error("This is possibly due to a BLASTN comparison " +
                          "being too distant for use.")
        logging.error(last_exception())
    return data[1]
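
The fraglengths JSON written above is what a later run with --skip_blastn reads back; the round trip is a plain json dump/load (hypothetical values):

import json

fraglengths = {"genome_A": [1020, 1020, 740]}  # hypothetical stem -> fragment lengths
with open("fraglengths.json", "w") as outfile:
    json.dump(fraglengths, outfile)
with open("fraglengths.json") as infile:
    assert json.load(infile) == fraglengths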
Example 11
def run_blast(args: Namespace, logger: Logger, infiles: List[Path],
              blastdir: Path) -> Tuple:
    """Run BLAST commands for ANIb methods.

    :param args:  Namespace of command-line options
    :param logger:  logging object
    :param infiles:  iterable of sequence files to compare
    :param blastdir:  path of directory holding fragment BLASTN databases

    Runs BLAST database creation and comparisons, returning the cumulative
    return values of the BLAST tool subprocesses, and the fragment sizes for
    each input file
    """
    if not args.skip_blastn:
        logger.info("Fragmenting input files, and writing to %s",
                    args.outdirname)
        fragfiles, fraglengths = make_sequence_fragments(
            args, logger, infiles, blastdir)

        # Run BLAST database-building and executables from a jobgraph
        logger.info("Creating job dependency graph")
        jobgraph = anib.make_job_graph(
            infiles, fragfiles,
            anib.make_blastcmd_builder(args.method, blastdir))
        if args.scheduler == "multiprocessing":
            logger.info("Running dependency graph with multiprocessing")
            cumval = run_mp.run_dependency_graph(jobgraph, logger=logger)
            if cumval > 0:
                logger.warning(
                    f"At least one BLAST run failed. {args.method} may fail. Please investigate."
                )
            else:
                logger.info("All multiprocessing jobs complete.")
        elif args.scheduler == "SGE":
            logger.info("Running dependency graph with SGE")
            run_sge.run_dependency_graph(jobgraph)
        else:
            logger.error(
                f"Scheduler {args.scheduler} not recognised (exiting)")
            raise SystemError(1)
    else:
        cumval = 0  # BLAST was skipped, so there is no cumulative return value
        logger.warning("Skipping BLASTN runs (as instructed)!")
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            fragpath = blastdir / "fraglengths.json"
            logger.info(f"Loading sequence fragments from {fragpath}")
            with open(fragpath, "rU") as ifh:
                fraglengths = json.load(ifh)
        else:
            fraglengths = dict()

    return cumval, fraglengths
Example 12
    def test_anib_concordance(self):
        """ANIb results concordant with JSpecies.

        We expect ANIb results to be quite different, as the BLASTN
        algorithm changed substantially between BLAST and BLAST+
        """
        # Perform ANIb on the input directory contents
        outdir = os.path.join(self.outdir, "blastn")
        os.makedirs(outdir, exist_ok=True)
        fragfiles, fraglengths = anib.fragment_fasta_files(
            self.infiles, outdir, self.fragsize)
        jobgraph = anib.make_job_graph(
            self.infiles, fragfiles,
            anib.make_blastcmd_builder("ANIb", outdir))
        assert_equal(0, run_mp.run_dependency_graph(jobgraph))
        results = anib.process_blast(outdir,
                                     self.orglengths,
                                     fraglengths,
                                     mode="ANIb")
        result_pid = results.percentage_identity
        result_pid.to_csv(os.path.join(self.outdir, "pyani_anib.tab"),
                          sep="\t")

        # Compare JSpecies output to results. We do this in two blocks,
        # masked according to whether the expected result is greater than
        # 90% identity, or less than that threshold.
        # The complete difference matrix is written to output, though
        result_pid = result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
        lo_result = result_pid.mask(result_pid >= 90).fillna(0)
        hi_result = result_pid.mask(result_pid < 90).fillna(0)
        lo_target = self.target["ANIb"].mask(
            self.target["ANIb"] >= 90).fillna(0)
        hi_target = self.target["ANIb"].mask(
            self.target["ANIb"] < 90).fillna(0)
        lo_diffmat = lo_result.values - lo_target.values  # as_matrix() was removed in pandas 1.0
        hi_diffmat = hi_result.values - hi_target.values
        diffmat = result_pid.values - self.target["ANIb"].values
        lo_diff = pd.DataFrame(lo_diffmat,
                               index=result_pid.index,
                               columns=result_pid.columns)
        hi_diff = pd.DataFrame(hi_diffmat,
                               index=result_pid.index,
                               columns=result_pid.columns)
        anib_diff = pd.DataFrame(diffmat,
                                 index=result_pid.index,
                                 columns=result_pid.columns)
        anib_diff.to_csv(os.path.join(self.outdir, "pyani_anib_diff.tab"),
                         sep="\t")
        assert_less(lo_diff.abs().values.max(), self.tolerance["ANIb_lo"])
        assert_less(hi_diff.abs().values.max(), self.tolerance["ANIb_hi"])
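
The mask/fillna pair used above splits a matrix at the identity threshold so each block can be held to its own tolerance; the mechanics on a tiny DataFrame:

import pandas as pd

pid = pd.DataFrame([[100.0, 72.5], [72.4, 100.0]], index=list("AB"), columns=list("AB"))
lo = pid.mask(pid >= 90).fillna(0)  # keeps only entries below 90, zeroes the rest
hi = pid.mask(pid < 90).fillna(0)   # keeps only entries of 90 or above
assert (lo + hi).equals(pid)        # the two blocks partition the matrix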
Example 13
def test_anib_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    tolerance_anib_lo,
    threshold_anib_lo_hi,
    fragment_length,
    tmp_path,
):
    """Check ANIb results are concordant with JSpecies.

    We expect ANIb results to be quite different, as the BLASTN
    algorithm changed substantially between BLAST and BLAST+ (the
    megaBLAST algorithm is now the default for BLASTN)
    """
    # Get lengths of input genomes
    orglengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Build and run BLAST jobs
    fragfiles, fraglengths = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    jobgraph = anib.make_job_graph(
        paths_concordance_fna, fragfiles, anib.make_blastcmd_builder("ANIb", tmp_path)
    )
    assert 0 == run_mp.run_dependency_graph(jobgraph)  # Jobs must run correctly

    # Process BLAST output
    result_pid = anib.process_blast(
        tmp_path, orglengths, fraglengths, mode="ANIb"
    ).percentage_identity

    # Compare JSpecies output to results. We do this in two blocks,
    # masked according to whether the expected result is greater than
    # a threshold separating "low" from "high" identity comparisons.
    result_pid = result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
    lo_result = result_pid.mask(result_pid >= threshold_anib_lo_hi).fillna(0).values
    hi_result = result_pid.mask(result_pid < threshold_anib_lo_hi).fillna(0).values

    tgt_pid = parse_jspecies(path_concordance_jspecies)["ANIb"]
    lo_target = tgt_pid.mask(tgt_pid >= threshold_anib_lo_hi).fillna(0).values
    hi_target = tgt_pid.mask(tgt_pid < threshold_anib_lo_hi).fillna(0).values

    assert (lo_result - lo_target, hi_result - hi_target) == (
        pytest.approx(0, abs=tolerance_anib_lo),
        pytest.approx(0, abs=tolerance_anib_hi),
    )
Example 14
    def test_anib_concordance(self):
        """ANIb results concordant with JSpecies.

        We expect ANIb results to be quite different, as the BLASTN
        algorithm changed substantially between BLAST and BLAST+
        """
        # Perform ANIb on the input directory contents
        outdir = os.path.join(self.outdir, "blastn")
        os.makedirs(outdir, exist_ok=True)
        fragfiles, fraglengths = anib.fragment_fasta_files(
            self.infiles, outdir, self.fragsize
        )
        jobgraph = anib.make_job_graph(
            self.infiles, fragfiles, anib.make_blastcmd_builder("ANIb", outdir)
        )
        assert_equal(0, run_mp.run_dependency_graph(jobgraph))
        results = anib.process_blast(outdir, self.orglengths, fraglengths, mode="ANIb")
        result_pid = results.percentage_identity
        result_pid.to_csv(os.path.join(self.outdir, "pyani_anib.tab"), sep="\t")

        # Compare JSpecies output to results. We do this in two blocks,
        # masked according to whether the expected result is greater than
        # 90% identity, or less than that threshold.
        # The complete difference matrix is written to output, though
        result_pid = result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
        lo_result = result_pid.mask(result_pid >= 90).fillna(0)
        hi_result = result_pid.mask(result_pid < 90).fillna(0)
        lo_target = self.target["ANIb"].mask(self.target["ANIb"] >= 90).fillna(0)
        hi_target = self.target["ANIb"].mask(self.target["ANIb"] < 90).fillna(0)
        lo_diffmat = lo_result.values - lo_target.values
        hi_diffmat = hi_result.values - hi_target.values
        diffmat = result_pid.values - self.target["ANIb"].values
        lo_diff = pd.DataFrame(
            lo_diffmat, index=result_pid.index, columns=result_pid.columns
        )
        hi_diff = pd.DataFrame(
            hi_diffmat, index=result_pid.index, columns=result_pid.columns
        )
        anib_diff = pd.DataFrame(
            diffmat, index=result_pid.index, columns=result_pid.columns
        )
        anib_diff.to_csv(os.path.join(self.outdir, "pyani_anib_diff.tab"), sep="\t")
        assert_less(lo_diff.abs().values.max(), self.tolerance["ANIb_lo"])
        assert_less(hi_diff.abs().values.max(), self.tolerance["ANIb_hi"])
Example 15
def calculate_anim(args: Namespace, infiles: List[Path],
                   org_lengths: Dict) -> pyani_tools.ANIResults:
    """Return ANIm result dataframes for files in input directory.

    :param args:  Namespace, command-line arguments
    :param infiles:  list of paths to each input file
    :param org_lengths:  dict, input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each pairwise
    comparison.
    """
    logger = logging.getLogger(__name__)

    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = args.outdirname / ALIGNDIR["ANIm"]
    logger.info("Writing nucmer output to %s", deltadir)
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            filter_exe=args.filter_exe,
            maxmatch=args.maxmatch,
            jobprefix=args.jobprefix,
        )
        if args.scheduler == "multiprocessing":
            logger.info("Running jobs with multiprocessing")
            if args.workers is None:
                logger.info(
                    "(using maximum number of available worker threads)")
            else:
                logger.info("(using %d worker threads, if available)",
                            args.workers)
            cumval = run_mp.run_dependency_graph(joblist,
                                                 workers=args.workers,
                                                 logger=logger)
            logger.info("Cumulative return value: %d", cumval)
            if cumval > 0:
                logger.warning(
                    "At least one NUCmer comparison failed. ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d", args.sgegroupsize)
            run_sge.run_dependency_graph(
                joblist,
                jgprefix=args.jobprefix,
                sgegroupsize=args.sgegroupsize,
                sgeargs=args.sgeargs,
            )
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    results = anim.process_deltadir(deltadir, org_lengths, logger=logger)
    if results.zero_error:  # zero percentage identity error
        if not args.skip_nucmer and args.scheduler == "multiprocessing":
            if cumval > 0:
                logger.error(
                    "This has possibly been a NUCmer run failure, please investigate",
                    exc_info=True,
                )
                raise SystemExit(1)
            logger.error(
                "This is possibly due to:\n\t(i) a NUCmer comparison being too distant "
                "for use (please consider using the --maxmatch option)\n\t(ii) NUCmer run "
                "failure (analysis will continue, but please investigate)")
    if not args.nocompress:
        logger.info("Compressing/deleting %s", deltadir)
        compress_delete_outdir(deltadir, logger)

    # Return processed data from .delta files
    return results
Example 16
def unified_anib(infiles, org_lengths):
    """Calculate ANIb for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    options.fragsize, and the multiple FASTA file that results written to
    the output directory. These are BLASTNed, pairwise, against the
    databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.  Only these
    qualifying matches contribute to the total aligned length, and total
    aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. These are written to the
    output directory in plain text tab-separated format.
    """
    logger.info("Running %s", args.method)
    blastdir = os.path.join(args.outdirname, ALIGNDIR[args.method])
    logger.info("Writing BLAST output to %s", blastdir)
    # Build BLAST databases and run pairwise BLASTN
    if not args.skip_blastn:
        # Make sequence fragments
        logger.info("Fragmenting input files, and writing to %s",
                    args.outdirname)
        # Fraglengths does not get reused with BLASTN
        fragfiles, fraglengths = anib.fragment_fasta_files(
            infiles, blastdir, args.fragsize)
        # Export fragment lengths as JSON, in case we re-run with --skip_blastn
        with open(os.path.join(blastdir, 'fraglengths.json'), 'w') as outfile:
            json.dump(fraglengths, outfile)

        # Which executables are we using?
        #if args.method == "ANIblastall":
        #    format_exe = args.formatdb_exe
        #    blast_exe = args.blastall_exe
        #else:
        #    format_exe = args.makeblastdb_exe
        #    blast_exe = args.blastn_exe

        # Run BLAST database-building and executables from a jobgraph
        logger.info("Creating job dependency graph")
        jobgraph = anib.make_job_graph(
            infiles, fragfiles,
            anib.make_blastcmd_builder(args.method, blastdir))
        #jobgraph = anib.make_job_graph(infiles, fragfiles, blastdir,
        #                               format_exe, blast_exe, args.method,
        #                               jobprefix=args.jobprefix)
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            logger.info("Running job dependency graph")
            cumval = run_mp.run_dependency_graph(jobgraph, logger=logger)
            if 0 < cumval:
                logger.warning(
                    "At least one BLAST run failed. " + "%s may fail.",
                    args.method)
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(jobgraph, logger=logger)
    else:
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            with open(os.path.join(blastdir, 'fraglengths.json'),
                      'r') as infile:  # 'rU' mode was removed in Python 3.11
                fraglengths = json.load(infile)
        else:
            fraglengths = None
        logger.warning("Skipping BLASTN runs (as instructed)!")

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output.", args.method)
    try:
        data = anib.process_blast(blastdir,
                                  org_lengths,
                                  fraglengths=fraglengths,
                                  mode=args.method)
    except ZeroDivisionError:
        logger.error("One or more BLAST output files has a problem.")
        if not args.skip_blastn:
            if 0 < cumval:
                logger.error("This is possibly due to BLASTN run failure, " +
                             "please investigate")
            else:
                logger.error("This is possibly due to a BLASTN comparison " +
                             "being too distant for use.")
        logger.error(last_exception())
    if not args.nocompress:
        logger.info("Compressing/deleting %s", blastdir)
        compress_delete_outdir(blastdir)

    # Return processed BLAST data
    return data
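
The 70% coverage / 30% identity filter described in the docstring reduces to two per-hit ratios; a sketch with hypothetical BLAST hit values (the helper name is illustrative, not pyani's):

def hit_qualifies(aln_length, query_length, identical_bases):
    """A fragment match counts toward ANIb only if it covers >= 70% of the
    query and shows >= 30% identity over the full query length."""
    return (aln_length / query_length >= 0.70 and
            identical_bases / query_length >= 0.30)

assert hit_qualifies(aln_length=900, query_length=1020, identical_bases=850)
assert not hit_qualifies(aln_length=500, query_length=1020, identical_bases=850)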
Example 17
def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each pairwise
    comparison.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = os.path.join(args.outdirname, ALIGNDIR['ANIm'])
    logger.info("Writing nucmer output to %s", deltadir)
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(infiles,
                                            args.outdirname,
                                            nucmer_exe=args.nucmer_exe,
                                            jobprefix=args.jobprefix)
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            if args.workers is None:
                logger.info("(using maximum number of available " +
                            "worker threads)")
            else:
                logger.info("(using %d worker threads, if available)",
                            args.workers)
            cumval = run_mp.run_dependency_graph(joblist,
                                                 workers=args.workers,
                                                 logger=logger)
            logger.info("Cumulative return value: %d", cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. " +
                               "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d", args.sgegroupsize)
            run_sge.run_dependency_graph(joblist,
                                         logger=logger,
                                         jgprefix=args.jobprefix,
                                         sgegroupsize=args.sgegroupsize)
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    results = anim.process_deltadir(deltadir, org_lengths, logger=logger)
    if results.zero_error:  # zero percentage identity error
        if not args.skip_nucmer and args.scheduler == 'multiprocessing':
            if 0 < cumval:
                logger.error("This has possibly been a NUCmer run failure, " +
                             "please investigate")
                logger.error(last_exception())
                sys.exit(1)
            else:
                logger.error("This is possibly due to a NUCmer comparison " +
                             "being too distant for use. Please consider " +
                             "using the --maxmatch option.")
                logger.error("This is alternatively due to NUCmer run " +
                             "failure, analysis will continue, but please " +
                             "investigate.")
    if not args.nocompress:
        logger.info("Compressing/deleting %s", deltadir)
        compress_delete_outdir(deltadir)

    # Return processed data from .delta files
    return results
Example 18
def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each pairwise
    comparison.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = os.path.join(args.outdirname, ALIGNDIR['ANIm'])
    logger.info("Writing nucmer output to %s" % deltadir)
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(infiles, args.outdirname,
                                            nucmer_exe=args.nucmer_exe,
                                            maxmatch=args.maxmatch,
                                            jobprefix=args.jobprefix)
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            if args.workers is None:
                logger.info("(using maximum number of available " +
                            "worker threads)")
            else:
                logger.info("(using %d worker threads, if available)" %
                            args.workers)
            cumval = run_mp.run_dependency_graph(joblist,
                                                 workers=args.workers, 
                                                 verbose=args.verbose,
                                                 logger=logger)
            logger.info("Cumulative return value: %d" % cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. " +
                               "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d" % args.sgegroupsize)
            run_sge.run_dependency_graph(joblist, verbose=args.verbose,
                                         logger=logger,
                                         jgprefix=args.jobprefix,
                                         sgegroupsize=args.sgegroupsize)
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    data = anim.process_deltadir(deltadir, org_lengths, logger=logger)
    if data[-1]:  # zero percentage identity error
        if not args.skip_nucmer and args.scheduler == 'multiprocessing':
            if 0 < cumval:
                logger.error("This has possibly been a NUCmer run failure, " +
                             "please investigate")
                logger.error(last_exception())
                sys.exit(1)
            else:
                logger.error("This is possibly due to a NUCmer comparison " +
                             "being too distant for use. Please consider " +
                             "using the --maxmatch option.")
                logger.error("This is alternatively due to NUCmer run " +
                             "failure, analysis will continue, but please " +
                             "investigate.")
    if not args.nocompress:
        logger.info("Compressing/deleting %s" % deltadir)
        compress_delete_outdir(deltadir)

    # Return processed data from .delta files
    return tuple(data[:-1])
Example 19
def unified_anib(infiles, org_lengths):
    """Calculate ANIb for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    options.fragsize, and the multiple FASTA file that results written to
    the output directory. These are BLASTNed, pairwise, against the
    databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.  Only these
    qualifying matches contribute to the total aligned length, and total
    aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. These are written to the
    output directory in plain text tab-separated format.
    """
    logger.info("Running %s" % args.method)
    # Build BLAST databases and run pairwise BLASTN
    if not args.skip_blastn:
        # Make sequence fragments
        logger.info("Fragmenting input files, and writing to %s" %
                    args.outdirname)
        # Fraglengths does not get reused with BLASTN
        fragfiles, fraglengths = anib.fragment_FASTA_files(infiles,
                                                           args.outdirname,
                                                           args.fragsize)
        # Export fragment lengths as JSON, in case we re-run BLASTALL with
        # --skip_blastn
        if args.method == "ANIblastall":
            with open(os.path.join(args.outdirname,
                                   'fraglengths.json'), 'w') as outfile:
                json.dump(fraglengths, outfile)

        # Which executables are we using?
        if args.method == "ANIblastall":
            format_exe = args.formatdb_exe
            blast_exe = args.blastall_exe
        else:
            format_exe = args.makeblastdb_exe
            blast_exe = args.blastn_exe

        # Run BLAST database-building and executables from a jobgraph
        logger.info("Creating job dependency graph")
        jobgraph = anib.make_job_graph(infiles, fragfiles, args.outdirname,
                                       format_exe, blast_exe, args.method)
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            logger.info("Running job dependency graph")
            cumval = run_mp.run_dependency_graph(jobgraph, verbose=args.verbose,
                                                 logger=logger)
            if 0 < cumval:
                logger.warning("At least one BLAST run failed. " +
                               "%s may fail." % args.method)
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(jobgraph, verbose=args.verbose,
                                         logger=logger)
    else:
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            with open(os.path.join(args.outdirname, 'fraglengths.json'),
                      'r') as infile:  # 'rU' mode was removed in Python 3.11
                fraglengths = json.load(infile)
        else:
            fraglengths = None
        logger.warning("Skipping BLASTN runs (as instructed)!")

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output." % args.method)
    try:
        data = anib.process_blast(args.outdirname, org_lengths,
                                  fraglengths=fraglengths, mode=args.method)
    except ZeroDivisionError:
        logger.error("One or more BLAST output files has a problem.")
        if not args.skip_blastn:
            if 0 < cumval:
                logger.error("This is possibly due to BLASTN run failure, " +
                             "please investigate")
            else:
                logger.error("This is possibly due to a BLASTN comparison " +
                             "being too distant for use.")
        logger.error(last_exception())
    return data
Example 20
def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each pairwise
    comparison.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(infiles,
                                            args.outdirname,
                                            nucmer_exe=args.nucmer_exe,
                                            maxmatch=args.maxmatch)
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            cumval = run_mp.run_dependency_graph(joblist,
                                                 verbose=args.verbose,
                                                 logger=logger)
            logger.info("Cumulative return value: %d" % cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. " +
                               "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(joblist,
                                         verbose=args.verbose,
                                         logger=logger)
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        if not args.skip_nucmer:
            if 0 < cumval:
                logger.error("This is possibly due to NUCmer run failure, " +
                             "please investigate")
            else:
                logger.error("This is possibly due to a NUCmer comparison " +
                             "being too distant for use. Please consider " +
                             "using the --maxmatch option.")
        logger.error(last_exception())
    return data
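
All of these examples start from the same fragmentation step: each input genome is cut into consecutive windows of fragsize bases (pyani's default FRAGSIZE is 1020). A Biopython sketch of that step, with a hypothetical helper name:

from Bio.SeqRecord import SeqRecord

def fragment_record(record, fragsize=1020):
    """Split one SeqRecord into consecutive fragments of at most fragsize bases."""
    return [SeqRecord(record.seq[i:i + fragsize],
                      id=f"frag{i // fragsize + 1:05d}", description="")
            for i in range(0, len(record.seq), fragsize)]

# Usage with Biopython's parser:
# from Bio import SeqIO
# fragments = [frag for rec in SeqIO.parse("genome.fasta", "fasta")
#              for frag in fragment_record(rec)]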