Example #1
def make_outdir(outdir: Path, force: bool, noclobber: bool) -> None:
    """Create output directory (allows for force and noclobber).

    :param outdir:  Path, path to output directory
    :param force:  bool, True if an existing directory will be reused
    :param noclobber:  bool, True if existing files are not overwritten

    The intended outcomes are:
    outdir doesn't exist: create outdir
    outdir exists: raise exception
    outdir exists, --force only: remove the directory tree
    outdir exists, --force --noclobber: continue with existing directory tree

    So long as the outdir is created with this function, we need only check
    for args.noclobber elsewhere to see how to proceed when a file exists.
    """
    # Create logger
    logger = logging.getLogger(__name__)
    logger.info("Creating output directory %s", outdir)

    if force:
        logger.warning(termcolor("Output directory overwrite forced", "red"))
        if outdir.is_dir() and not noclobber:
            logger.warning(
                termcolor("Clobbering existing directory %s", "red"), outdir)
            shutil.rmtree(outdir)
    outdir.mkdir(parents=True, exist_ok=force)
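A minimal usage sketch of the outcomes listed in the docstring; the path and flag values are hypothetical, and the no-force collision surfaces as the FileExistsError raised by Path.mkdir:

from pathlib import Path

outdir = Path("ani_output")  # hypothetical output location

make_outdir(outdir, force=False, noclobber=False)  # absent: directory created
make_outdir(outdir, force=True, noclobber=True)    # existing tree reused as-is
make_outdir(outdir, force=True, noclobber=False)   # tree removed, then recreated
# Without force, an existing outdir raises FileExistsError from Path.mkdir:
# make_outdir(outdir, force=False, noclobber=False)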
Example #2
def add_log_headers():
    """Add headers to log output."""
    logger = logging.getLogger(__name__)

    # Add citation information to log
    logger.info(termcolor("CITATION INFO", bold=True))
    for line in CITATION_INFO:
        logger.info(line)

    # Add dependency citations
    logger.info(termcolor("DEPENDENCIES", bold=True))
    dep_citations = [
        "The authors of pyani gratefully acknowledge its dependence on",
        "the following bioinformatics software:",
        f"\t{termcolor('MUMmer3', 'cyan')}: S. Kurtz, A. Phillippy, A.L. Delcher, M. Smoot, M. Shumway,",
        "\tC. Antonescu, and S.L. Salzberg (2004), 'Versatile and open software",
        "\tfor comparing large genomes' Genome Biology 5:R12",
        f"\t{termcolor('BLAST+', 'cyan')}: Camacho C., Coulouris G., Avagyan V., Ma N., Papadopoulos J.,",
        "\tBealer K., & Madden T.L. (2008) 'BLAST+: architecture and applications.'",
        "\tBMC Bioinformatics 10:421.",
        f"\t{termcolor('BLAST', 'cyan')}: Altschul, S.F., Madden, T.L., Schäffer, A.A., Zhang, J.,",
        "\tZhang, Z., Miller, W. & Lipman, D.J. (1997) 'Gapped BLAST and PSI-BLAST:",
        "\ta new generation of protein database search programs.' Nucleic Acids Res.",
        "\t25:3389-3402",
        f"\t{termcolor('Biopython', 'cyan')}: C**k PA, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A,",
        "\tFriedberg I, Hamelryck T, Kauff F, Wilczynski B and de Hoon MJL",
        "\t(2009) Biopython: freely available Python tools for computational",
        "\tmolecular biology and bioinformatics. Bioinformatics, 25, 1422-1423",
    ]
    for line in dep_citations:
        logger.info(line)
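The termcolor used throughout these examples is pyani's own helper (pyani.pyani_tools.termcolor), not the termcolor PyPI package. A plausible minimal sketch consistent with its call sites here (positional colour name, bold keyword), offered as an assumption rather than the actual implementation:

from typing import Optional

# Hypothetical stand-in for pyani.pyani_tools.termcolor, assuming it wraps
# text in ANSI escape codes; the real helper may differ in detail.
ANSI = {"red": "\033[31m", "green": "\033[32m", "yellow": "\033[33m",
        "blue": "\033[34m", "magenta": "\033[35m", "cyan": "\033[36m"}
BOLD, RESET = "\033[1m", "\033[0m"

def termcolor(text: str, color: Optional[str] = None, bold: bool = False) -> str:
    """Return text wrapped in ANSI colour/bold escapes (plain if no styling)."""
    prefix = ANSI.get(color, "") + (BOLD if bold else "")
    return f"{prefix}{text}{RESET}" if prefix else text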
Example #3
def subcmd_download(args: Namespace) -> int:
    """Download assembled genomes in subtree of passed NCBI taxon ID.

    :param args:  Namespace, command-line arguments
    """
    # Create logger
    logger = logging.getLogger(__name__)
    logger.info(termcolor("Downloading genomes from NCBI", "red"))

    # Create output directory, respecting force/noclobber
    if args.dryrun:
        logger.warning(
            termcolor("Dry run only: will not overwrite or download", "cyan"))
    else:
        make_outdir(args.outdir, args.force, args.noclobber)

    api_key = configure_entrez(args)  # set up email/get API key
    asm_dict = get_tax_asm_dict(args)  # dictionary of assembly UIDs for download

    # Download contigs and hashes for each assembly UID in the dictionary
    # Collect class and label information for each downloaded genome, plus a list
    # of skipped genome data
    classes, labels, skippedlist = download_data(args, api_key, asm_dict)

    # Write class and label files
    if not args.dryrun:
        classfname = args.outdir / args.classfname
        logger.info("Writing classes file to %s", classfname)
        if classfname.exists() and args.noclobber:
            logger.warning("Class file %s exists, not overwriting", classfname)
        else:
            with open(classfname, "w") as ofh:
                ofh.write("\n".join(classes) + "\n")

        labelfname = args.outdir / args.labelfname
        logger.info("Writing labels file to %s", labelfname)
        if labelfname.exists() and args.noclobber:
            logger.warning("Labels file %s exists, not overwriting",
                           labelfname)
        else:
            with open(labelfname, "w") as ofh:
                ofh.write("\n".join(labels) + "\n")

    # Report skipped genome list
    if skippedlist:
        logger.warning(termcolor("%s genome downloads were skipped", "red"),
                       len(skippedlist))
        for skipped in skippedlist:
            outstr = "\n\t".join([
                f"taxon id: {skipped.taxon_id}",
                f"accession: {skipped.accession}",
                f"URL: {skipped.url}",
                f"source: {skipped.dltype}",
            ])
            logger.warning("%s %s:\n\t%s", skipped.organism, skipped.strain,
                           outstr)

    return 0
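A hedged dry-run invocation; the Namespace fields mirror those read above, all values are hypothetical, and configure_entrez will read further Entrez fields (e.g. an email address) not shown here:

from argparse import Namespace
from pathlib import Path

args = Namespace(
    dryrun=True,                   # log what would happen; download nothing
    outdir=Path("genomes"),        # hypothetical output directory
    force=False, noclobber=True,
    taxon="203804", retries=10,    # hypothetical taxon ID / retry budget
    classfname="classes.txt", labelfname="labels.txt",
)
subcmd_download(args)  # returns 0 on success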
Example #4
def download_data(
    args: Namespace,
    api_key: Optional[str],
    asm_dict: Dict[str, List],
) -> Tuple[List, List, List]:
    """Download the accessions indicated in the passed dictionary.

    :param args:  Namespace of command-line arguments
    :param api_key:  str, API key for NCBI downloads
    :param asm_dict:  dictionary of assembly UIDs to download, keyed by taxID

    Returns lists of information about downloaded genome classes and labels, and a
    list of skipped downloads (as Skipped objects).
    """
    logger = logging.getLogger(__name__)

    classes, labels, skippedlist = [], [], []

    for tid, uids in asm_dict.items():
        logger.info(termcolor("Downloading contigs for Taxon ID %s", "blue"),
                    uids)
        for uid in uids:
            # Obtain eSummary for each assembly UID
            logger.info(
                termcolor("Retrieving eSummary information for UID %s",
                          "cyan"), uid)
            esummary, filestem = download.get_ncbi_esummary(
                uid, args.retries, api_key)
            uid_class = download.get_ncbi_classification(esummary)
            logger.debug(
                "eSummary information (%s):\n\t%s",
                filestem,
                dl_info_to_str(esummary, uid_class),
            )
            if args.dryrun:
                logger.warning("(dry-run) skipping download of %s",
                               esummary["AssemblyAccession"])
                continue

            # Download genome for UID, and extract compressed files
            dlstatus, skipped_genomes = download_genome(
                args, filestem, tid, uid, uid_class)
            skippedlist.extend(skipped_genomes)
            if not dlstatus.skipped:
                extract_genomes(args, dlstatus, esummary)
                labeltxt, classtxt = hash_genomes(args, dlstatus, filestem,
                                                  uid_class)
                classes.append(classtxt)
                labels.append(labeltxt)
                logger.info(
                    "Label and class file entries\n\tLabel: %s\n\tClass: %s",
                    labeltxt,
                    classtxt,
                )

    return classes, labels, skippedlist
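The skippedlist entries are Skipped namedtuples, whose fields (taxon_id, accession, organism, strain, url, dltype) are defined where subcmd_download constructs them later in these examples; a caller can report them directly:

classes, labels, skippedlist = download_data(args, api_key, asm_dict)
for skip in skippedlist:
    # Fields come from the Skipped namedtuple defined in subcmd_download
    print(skip.accession, skip.dltype, skip.url)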
Example #5
def get_ncbi_esummary(asm_uid, retries, api_key=None) -> Tuple:
    """Obtain full eSummary info for the passed assembly UID.

    :param asm_uid:  str, NCBI assembly UID
    :param retries:  int, maximum number of Entrez retry attempts
    :param api_key:  str, NCBI API key (optional)
    """
    logger = logging.getLogger(__name__)

    # Obtain full eSummary data for the assembly
    summary = entrez_esummary(
        retries=retries, db="assembly", id=asm_uid, report="full", api_key=api_key
    )

    # Extract filestem from assembly data
    try:
        data = summary["DocumentSummarySet"]["DocumentSummary"][0]
    except (IndexError, KeyError):
        # Something has gone awry with the download
        logger.warning(
            termcolor("Could not get eSummary for UID %s", "red"),
            asm_uid,
            exc_info=True,
        )
        raise NCBIDownloadException(f"Could not get NCBI eSummary for UID {asm_uid}")

    filestem = extract_filestem(data)

    return (data, filestem)
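Usage sketch (the assembly UID is hypothetical); a failed eSummary fetch propagates as NCBIDownloadException for the caller to handle:

try:
    esummary, filestem = get_ncbi_esummary("2025981", retries=10)  # hypothetical UID
except NCBIDownloadException:
    esummary, filestem = None, None  # caller decides whether to skip or abort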
Example #6
def subcmd_plot(args: Namespace) -> int:
    """Produce graphical output for an analysis.

    :param args:  Namespace of command-line arguments

    This is graphical output for representing the ANI analysis results, and
    takes the form of a heatmap, or heatmap with dendrogram.
    """
    logger = logging.getLogger(__name__)

    # Announce what's going on to the user
    logger.info(termcolor("Generating graphical output for analyses", "red"))
    logger.info("Writing output to: %s", args.outdir)
    os.makedirs(args.outdir, exist_ok=True)
    logger.info("Rendering method: %s", args.method)

    # Connect to database session
    logger.debug("Activating session for database: %s", args.dbpath)
    session = pyani_orm.get_session(args.dbpath)

    # Parse output formats
    outfmts = args.formats.split(",")
    logger.debug("Requested output formats: %s", outfmts)

    # Work on each run:
    run_ids = [int(run) for run in args.run_id.split(",")]
    logger.debug("Generating graphics for runs: %s", run_ids)
    for run_id in run_ids:
        write_run_heatmaps(run_id, session, outfmts, args)

    return 0
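For reference, args.formats and args.run_id are plain comma-separated strings, so hypothetical values parse as follows:

# Hypothetical command-line values, parsed as in subcmd_plot above
formats, run_id = "png,pdf", "1,3"
outfmts = formats.split(",")                       # ["png", "pdf"]
run_ids = [int(run) for run in run_id.split(",")]  # [1, 3]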
Example #7
def run_main(argv: Optional[List[str]] = None) -> int:
    """Run main process for pyani.py script.

    :param argv:  list of command-line arguments (if None, sys.argv is parsed)
    """
    # If we need to (i.e. a namespace isn't passed), parse the command-line
    if argv is None:
        args = parse_cmdline()
    else:
        args = parse_cmdline(argv)

    # Catch execution with no arguments
    if len(sys.argv) == 1:
        sys.stderr.write("pyani version: {0}\n".format(__version__))
        return 0

    # Set up logging
    time0 = time.time()
    logger = logging.getLogger(__name__)
    config_logger(args)

    # Boilerplate for log
    logger.info("Processed arguments: %s", args)
    args.cmdline = " ".join(sys.argv)
    logger.info("command-line: %s", args.cmdline)
    add_log_headers()

    # Run the subcommand
    returnval = args.func(args)
    logger.info(
        termcolor("Completed. Time taken: %.3f", bold=True), (time.time() - time0)
    )
    return returnval
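A conventional entry-point wrapper (a sketch; pyani's actual console-script wiring may differ):

import sys

if __name__ == "__main__":
    sys.exit(run_main())  # propagate the subcommand's return value as exit status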
Example #8
def add_log_headers():
    """Add headers to log output."""
    logger = logging.getLogger(__name__)

    # Add citation information to log
    logger.info(termcolor("CITATION INFO", bold=True))
    pyani_citation = [
        termcolor(
            "If you use pyani in your work, please cite the following publication:",
            "green",
        ),
        termcolor(
            "\tPritchard, L., Glover, R. H., Humphris, S., Elphinstone, J. G.,",
            "yellow",
        ),
        termcolor(
            "\t& Toth, I.K. (2016) 'Genomics and taxonomy in diagnostics for", "yellow"
        ),
        termcolor(
            "\tfood security: soft-rotting enterobacterial plant pathogens.'", "yellow"
        ),
        termcolor(
            "\tAnalytical Methods, 8(1), 12–24. http://doi.org/10.1039/C5AY02550H",
            "yellow",
        ),
    ]
    for line in pyani_citation:
        logger.info(line)

    # Add dependency citations
    logger.info(termcolor("DEPENDENCIES", bold=True))
    dep_citations = [
        "The authors of pyani gratefully acknowledge its dependence on",
        "the following bioinformatics software:",
        f"\t{termcolor('MUMmer3', 'cyan')}: S. Kurtz, A. Phillippy, A.L. Delcher, M. Smoot, M. Shumway,",
        "\tC. Antonescu, and S.L. Salzberg (2004), 'Versatile and open software",
        "\tfor comparing large genomes' Genome Biology 5:R12",
        f"\t{termcolor('BLAST+', 'cyan')}: Camacho C., Coulouris G., Avagyan V., Ma N., Papadopoulos J.,",
        "\tBealer K., & Madden T.L. (2008) 'BLAST+: architecture and applications.'",
        "\tBMC Bioinformatics 10:421.",
        f"\t{termcolor('BLAST', 'cyan')}: Altschul, S.F., Madden, T.L., Schäffer, A.A., Zhang, J.,",
        "\tZhang, Z., Miller, W. & Lipman, D.J. (1997) 'Gapped BLAST and PSI-BLAST:",
        "\ta new generation of protein database search programs.' Nucleic Acids Res.",
        "\t25:3389-3402",
        f"\t{termcolor('Biopython', 'cyan')}: C**k PA, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A,",
        "\tFriedberg I, Hamelryck T, Kauff F, Wilczynski B and de Hoon MJL",
        "\t(2009) Biopython: freely available Python tools for computational",
        "\tmolecular biology and bioinformatics. Bioinformatics, 25, 1422-1423",
    ]
    for line in dep_citations:
        logger.info(line)
Example #9
def get_tax_asm_dict(args: Namespace) -> Dict[str, List]:
    """Return dictionary of assembly UIDs to download, keyed by taxID.

    :param args:  Namespace of command-line arguments
    """
    logger = logging.getLogger(__name__)

    taxon_ids = download.split_taxa(args.taxon)
    logger.info(termcolor("Taxon IDs received: %s", "blue"), taxon_ids)
    asm_dict = download.make_asm_dict(taxon_ids, args.retries)
    for tid, uids in asm_dict.items():
        logger.debug(
            "Taxon ID summary\n\tQuery: %s\n\tasm count: %s\n\tUIDs: %s",
            tid,
            len(uids),
            uids,
        )
    return asm_dict
Example #10
def download_genome_and_hash(
    outdir: Path,
    timeout: int,
    dlfiledata: DLFileData,
    dltype: str = "RefSeq",
    disable_tqdm: bool = False,
) -> namedlist:
    """Download genome and accompanying MD5 hash from NCBI.

    :param outdir:  Path to output directory for downloads
    :param timeout:  int: timeout for download attempt
    :param dlfiledata:  namedtuple of info for file to download
    :param dltype:  reference database to use: RefSeq or GenBank
    :param disable_tqdm:  disable progress bar

    This function tries the (assumed to be passed) RefSeq FTP URL first and,
    if that fails, then attempts to download the corresponding GenBank data.

    We attempt to gracefully skip genomes with download errors.
    """
    # Create logger
    logger = logging.getLogger(__name__)

    if dltype == "GenBank":
        filestem = re.sub("^GCF_", "GCA_", dlfiledata.filestem)
    else:
        filestem = dlfiledata.filestem
    dlstatus = retrieve_genome_and_hash(
        filestem,
        dlfiledata.suffix,
        dlfiledata.ftpstem,
        outdir,
        timeout,
        disable_tqdm,
    )
    # Pylint is confused by the content of dlstatus (a namedlist)
    if dlstatus.error is not None:  # pylint: disable=no-member
        logger.warning(termcolor("%s download failed: skipping!", "magenta"),
                       dltype)
        logger.debug("Exception raised:\n%s", dlstatus.error)  # pylint: disable=no-member
        dlstatus.skipped = True

    return dlstatus  # pylint: disable=no-member
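Usage sketch, reusing the DLFileData construction shown in subcmd_download below; the filestem is a hypothetical RefSeq assembly name:

from pathlib import Path

dlfiledata = DLFileData("GCF_000011605.1_ASM1160v1",  # hypothetical filestem
                        "ftp://ftp.ncbi.nlm.nih.gov/genomes/all",
                        "genomic.fna.gz")
dlstatus = download_genome_and_hash(Path("genomes"), 10, dlfiledata,
                                    dltype="RefSeq", disable_tqdm=True)
if dlstatus.skipped:
    print("Download skipped:", dlstatus.url)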
Example #11
def run_anim_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
    """Pass ANIm nucmer jobs to the scheduler.

    :param joblist:           list of ComparisonJob namedtuples
    :param args:              command-line arguments for the run
    """
    logger = logging.getLogger(__name__)
    logger.debug("Scheduler: %s", args.scheduler)

    if args.scheduler == "multiprocessing":
        logger.info("Running jobs with multiprocessing")
        if not args.workers:
            logger.debug("(using maximum number of worker threads)")
        else:
            logger.debug("(using %d worker threads, if available)",
                         args.workers)
        cumval = run_mp.run_dependency_graph([_.job for _ in joblist],
                                             workers=args.workers)
        if cumval > 0:
            logger.error(
                "At least one NUCmer comparison failed. Please investigate (exiting)"
            )
            raise PyaniException("Multiprocessing run failed in ANIm")
        logger.info("Multiprocessing run completed without error")
    elif args.scheduler.lower() == "sge":
        logger.info("Running jobs with SGE")
        logger.debug("Setting jobarray group size to %d", args.sgegroupsize)
        logger.debug("Joblist contains %d jobs", len(joblist))
        run_sge.run_dependency_graph(
            [_.job for _ in joblist],
            jgprefix=args.jobprefix,
            sgegroupsize=args.sgegroupsize,
            sgeargs=args.sgeargs,
        )
    else:
        logger.error(termcolor("Scheduler %s not recognised", "red"),
                     args.scheduler)
        raise SystemExit(1)
Example #12
def subcmd_report(args: Namespace) -> int:
    """Present report on ANI results and/or database contents.

    :param args:  Namespace, command-line arguments

    The report subcommand takes any of several long options that do one of two
    things:

    1. perform a single action.
    2. set a parameter/format

    These will typically take an output path to a file or directory into which
    the report will be written (whatever form it takes). By default, text
    output is written in plain text format, but for some outputs this can
    be modified by an 'excel' or 'html' format specifier, which writes outputs
    in that format, where possible.
    """
    logger = logging.getLogger(__name__)

    # Output formats will apply across all tabular data requested
    # Expect comma-separated format arguments, and turn them into an iterable
    formats = process_formats(args)
    logger.info(termcolor("Creating report output in formats: %s", "red"),
                formats)

    # Declare which database is being used, and connect to session
    logger.debug("Using database: %s", args.dbpath)
    session = pyani_orm.get_session(args.dbpath)

    # Report runs in the database
    if args.show_runs:
        statement = session.query(Run.run_id, Run.name, Run.method, Run.date,
                                  Run.cmdline).statement
        headers = ["run ID", "name", "method", "date run", "command-line"]
        report(args, session, formats, ReportParams("runs", statement,
                                                    headers))

    # Report genomes in the database
    if args.show_genomes:
        statement = session.query(
            Genome.genome_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Genome.length,
        ).statement
        headers = [
            "genome ID", "description", "path", "MD5 hash", "genome length"
        ]
        report(args, session, formats,
               ReportParams("genomes", statement, headers))

    # Report table of all genomes used for each run
    if args.show_runs_genomes:
        statement = (session.query(
            Run.run_id,
            Run.name,
            Run.method,
            Run.date,
            Genome.genome_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Label.label,
            Label.class_label,
        ).join(rungenome, Genome.genome_id == rungenome.c.genome_id).join(
            Label,
            and_(Genome.genome_id == Label.genome_id,
                 Run.run_id == Label.run_id),
        ).order_by(Run.run_id, Genome.genome_id).statement)
        headers = [
            "run ID",
            "run name",
            "method",
            "date run",
            "genome ID",
            "genome description",
            "genome path",
            "genome hash",
            "genome label",
            "genome class",
        ]
        report(
            args,
            session,
            formats,
            ReportParams("runs_genomes", statement, headers),
        )

    # Report table of all runs in which a genome is involved
    if args.show_genomes_runs:
        statement = (session.query(
            Genome.genome_id,
            Run.run_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Label.label,
            Label.class_label,
            Run.name,
            Run.method,
            Run.date,
        ).join(rungenome, Run.run_id == rungenome.c.run_id).join(
            Label,
            and_(Genome.genome_id == Label.genome_id,
                 Run.run_id == Label.run_id),
        ).order_by(Genome.genome_id, Run.run_id).statement)
        headers = [
            "genome ID",
            "run ID",
            "genome description",
            "genome path",
            "genome hash",
            "genome label",
            "genome class",
            "run name",
            "method",
            "date run",
        ]
        report(
            args,
            session,
            formats,
            ReportParams("genomes_runs", statement, headers),
        )

    # Report table of comparison results for the indicated runs
    if args.run_results:
        run_ids = [run_id.strip() for run_id in args.run_results.split(",")]
        logger.debug("Attempting to write results tables for runs: %s",
                     run_ids)
        for run_id in run_ids:
            logger.debug("Processing run ID %s", run_id)
            genome_query = aliased(Genome, name="genome_query")
            genome_subject = aliased(Genome, name="genome_subject")
            statement = (session.query(
                Comparison.comparison_id,
                Comparison.query_id,
                genome_query.description,
                Comparison.subject_id,
                genome_subject.description,
                Comparison.identity,
                Comparison.cov_query,
                Comparison.cov_subject,
                Comparison.aln_length,
                Comparison.sim_errs,
                Comparison.program,
                Comparison.version,
                Comparison.fragsize,
                Comparison.maxmatch,
                Run.run_id,
            ).join(
                genome_query,
                Comparison.query_id == genome_query.genome_id).join(
                    genome_subject,
                    Comparison.subject_id == genome_subject.genome_id).filter(
                        Run.run_id == run_id).statement)
            headers = [
                "Comparison ID",
                "Query ID",
                "Query description",
                "Subject ID",
                "Subject description",
                "% identity",
                "% query coverage",
                "% subject coverage",
                "alignment length",
                "similarity errors",
                "program",
                "version",
                "fragment size",
                "maxmatch",
                "Run ID",
            ]
            report(
                args,
                session,
                formats,
                ReportParams(f"results_{run_id}", statement, headers),
            )

    # Report matrices of comparison results for the indicated runs
    # For ANIm, all results other than coverage are symmetric matrices,
    # so we only get results in the forward direction.
    # As we need to pull down the matrices as Pandas dataframes by reading from
    # JSON, we don't bother with a helper function like report(), and write out
    # our matrices directly here
    if args.run_matrices:
        for run_id in [
                run_id.strip() for run_id in args.run_matrices.split(",")
        ]:
            logger.debug("Extracting matrices for run %s", run_id)
            run = session.query(Run).filter(Run.run_id == run_id).first()
            matlabel_dict = get_matrix_labels_for_run(session, run_id)
            for matdata in [
                    MatrixData(*_) for _ in [
                        ("identity", run.df_identity, {
                            "colour_num": 0.95
                        }),
                        ("coverage", run.df_coverage, {
                            "colour_num": 0.95
                        }),
                        ("aln_lengths", run.df_alnlength, {}),
                        ("sim_errors", run.df_simerrors, {}),
                        ("hadamard", run.df_hadamard, {}),
                    ]
            ]:
                logger.debug("Writing %s results", matdata.name)
                matrix = pd.read_json(matdata.data)
                # Matrix rows and columns are labelled if there's a label dictionary,
                # and take the dataframe index otherwise
                matrix = label_results_matrix(matrix, matlabel_dict)
                pyani_report.write_dbtable(
                    matrix,
                    Path("_".join([
                        str(args.outdir / "matrix"), matdata.name,
                        str(run_id)
                    ])),
                    formats,
                    show_index=True,
                    **matdata.graphic_args,
                )

    return 0
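process_formats is not shown in these examples; a minimal sketch consistent with the comment above (comma-separated format arguments turned into an iterable) might look like this, assuming plain-text tab output is always produced:

# Hypothetical stand-in for process_formats; the real helper may also
# validate the requested formats against those pyani_report supports.
def process_formats(args):
    """Return list of output formats parsed from comma-separated args.formats."""
    formats = ["tab"]  # assume plain-text output is always written
    if args.formats:
        formats += [fmt.strip() for fmt in args.formats.split(",")]
    return sorted(set(formats))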
Example #13
def subcmd_anib(args: Namespace) -> None:
    """Perform ANIb on all genome files in an input directory.

    :param args:  Namespace, command-line arguments

    Finds ANI by the ANIb method, as described in Goris J, Konstantinidis KT,
    Klappenbach JA, Coenye T, Vandamme P, et al. (2007) DNA-DNA hybridization
    values and their relationship to whole-genome sequence similarities.
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0.

    All FASTA format files (selected by suffix) in the input directory are
    fragmented into (by default 1020nt) consecutive sections, and a BLAST+
    database constructed from the whole genome input. The BLAST+ blastn tool
    is then used to query each set of fragments against each BLAST+ database,
    in turn.

    For each query, the BLAST+ .tab output is parsed to obtain alignment length,
    identity and similarity error count. Alignments below a threshold are not
    included in the calculation (this introduces systematic bias with respect to
    ANIm). The results are processed to calculate the ANI percentages, coverage,
    and similarity error.

    The calculated values are stored in the local SQLite3 database.
    """
    logger = logging.getLogger(__name__)

    logger.info(termcolor("Running ANIm analysis",
                          "red"))  # announce that we're starting

    # Get BLAST+ version - this will be used in the database entries
    blastn_version = anib.get_version(args.blastn_exe)
    logger.info(termcolor("BLAST+ blastn version: %s", "cyan"), blastn_version)

    # Use provided name, or make new one for this analysis
    start_time = datetime.datetime.now()
    name = args.name or "_".join(["ANIb", start_time.isoformat()])
    logger.info("Analysis name: %s", name)

    # Connect to existing database (which may be "clean" or have old analyses)
    logger.debug("Connecting to database %s", args.dbpath)
    try:
        session = get_session(args.dbpath)
    except Exception:
        logger.error("Could not connect to database %s (exiting)",
                     args.dbpath,
                     exc_info=True)
        raise SystemExit(1)

    # Add information about this run to the database
    logger.debug("Adding run info to database %s...", args.dbpath)
    try:
        run = add_run(
            session,
            method="ANIb",
            cmdline=args.cmdline,
            date=start_time,
            status="started",
            name=name,
        )
    except PyaniORMException:
        logger.error("Could not add run to the database (exiting)",
                     exc_info=True)
        raise SystemExit(1)
    logger.debug("\t...added run ID: %s to the database", run)

    # Identify input files for comparison, and populate the database
    logger.debug("Adding files for %s to database...", run)
    try:
        genome_ids = add_run_genomes(session, run, args.indir, args.classes,
                                     args.labels)
    except PyaniORMException:
        logger.error("Could not add genomes to database for run %s (exiting)",
                     run,
                     exc_info=True)
        raise SystemExit(1)
    logger.debug("\t...added genome IDs: %s", genome_ids)

    # Get list of genomes for this analysis from the database
    logger.info("Compiling genomes for comparison")
    genomes = run.genomes.all()
    logger.debug("\tCollected %s genomes for this run", len(genomes))

    # Create output directories. We create the main parent directory (args.outdir),
    # and also subdirectories for the sequence fragments and BLAST databases
    logger.debug("Creating output directory %s", args.outdir)
    try:
        os.makedirs(args.outdir, exist_ok=True)
    except IOError:
        logger.error("Could not create output directory %s (exiting)",
                     args.outdir,
                     exc_info=True)
        raise SystemExit(1)
    fragdir = Path(str(args.outdir)) / "fragments"
    blastdbdir = Path(str(args.outdir)) / "blastdbs"
    logger.debug("\t...creating subdirectories")
    os.makedirs(fragdir, exist_ok=True)
    os.makedirs(blastdbdir, exist_ok=True)

    # Create a new sequence fragment file and a new BLAST+ database for each input genome,
    # and add this data to the database as a row in BlastDB
    logger.info("Creating input sequence fragment files")
    for genome in genomes:
        fragpath, fraglengths = fragment_fasta_file(Path(str(genome.path)),
                                                    Path(str(fragdir)),
                                                    args.fragsize)
        logger.debug("Fragment file %s contains %s fragments", fragpath,
                     len(fraglengths))
        # blastdb = add_blastdb(
        #     session, genome, run, fragpath, dbpath, fraglengths, dbcmd
        # )

    raise NotImplementedError
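
    # NOTE: execution stops at the NotImplementedError above; the remainder of
    # this function sketches the intended ANIb workflow but is not yet reached.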

    # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples
    logger.info(
        "Compiling pairwise comparisons (this can take time for large datasets)..."
    )
    comparisons = list(
        permutations(tqdm(genomes, disable=args.disable_tqdm), 2))
    logger.info("\t...total pairwise comparisons to be performed: %s",
                len(comparisons))

    # Check for existing comparisons; if one has already been done (for the same
    # software package, version, and setting) we add the comparison to this run,
    # but remove it from the list of comparisons to be performed
    logger.info("Checking database for existing comparison data...")
    comparisons_to_run = filter_existing_comparisons(session, run, comparisons,
                                                     "blastn", blastn_version,
                                                     args.fragsize, None)
    logger.info("\t...after check, still need to run %s comparisons",
                len(comparisons_to_run))

    # If there are no comparisons to run, update the Run matrices and exit
    # from this function
    if not comparisons_to_run:
        logger.info(
            termcolor(
                "All comparison results present in database (skipping comparisons)",
                "magenta",
            ))
        logger.info("Updating summary matrices with existing results")
        update_comparison_matrices(session, run)
        return

    # If we are in recovery mode, we are salvaging output from a previous
    # run, and do not necessarily need to rerun all the jobs. In this case,
    # we prepare a list of output files we want to recover from the results
    # in the output directory.
    if args.recovery:
        logger.warning("Entering recovery mode...")
        logger.debug(
            "\tIn this mode, existing comparison output from %s is reused",
            args.outdir)
        existingfiles = collect_existing_output(args.outdir, "blastn", args)
        logger.debug("\tIdentified %s existing output files for reuse",
                     len(existingfiles))
    else:
        existingfiles = []
        logger.debug("\tIdentified no existing output files")

    # Split the input genome files into contiguous fragments of the specified size,
    # as described in Goris et al. We create a new directory to hold sequence
    # fragments, away from the main genomes
    logger.info("Splitting input genome files into %snt fragments...",
                args.fragsize)
    fragdir = Path(args.outdir) / "fragments"
    os.makedirs(fragdir, exist_ok=True)
    fragfiles, fraglens = anib.fragment_fasta_files(
        [Path(str(_.path)) for _ in genomes],
        Path(args.outdir) / "fragments",
        args.fragsize,
    )
    logger.debug("...wrote %s fragment files to %s", len(fragfiles), fragdir)

    # Create list of BLASTN jobs for each comparison still to be performed
    logger.info("Creating blastn jobs for ANIb...")
    joblist = generate_joblist(comparisons_to_run, existingfiles, fragfiles,
                               fraglens, args)
    logger.debug(f"...created %s blastn jobs", len(joblist))

    raise NotImplementedError
Example #14
def subcmd_anim(args: Namespace) -> None:
    """Perform ANIm on all genome files in an input directory.

    :param args:  Namespace, command-line arguments

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (whose path must
    be provided).

    For each pairwise comparison, the NUCmer .delta file output is parsed to
    obtain an alignment length and similarity error count for every unique
    region alignment between the two organisms, as represented by
    sequences in the FASTA files. These are processed to calculated aligned
    sequence lengths, average nucleotide identity (ANI) percentages, coverage
    (aligned percentage of whole genome - forward direction), and similarity
    error count for each pairwise comparison.

    The calculated values are deposited in the SQLite3 database being used for
    the analysis.

    For each pairwise comparison the NUCmer output is stored in the output
    directory for long enough to extract summary information, but for each run
    the output is gzip compressed. Once all runs are complete, the outputs
    for each comparison are concatenated into a single gzip archive.
    """
    # Create logger
    logger = logging.getLogger(__name__)

    # Announce the analysis
    logger.info(termcolor("Running ANIm analysis", bold=True))

    # Get current nucmer version
    nucmer_version = anim.get_version(args.nucmer_exe)
    logger.info(termcolor("MUMMer nucmer version: %s", "cyan"), nucmer_version)

    # Use the provided name or make one for the analysis
    start_time = datetime.datetime.now()
    name = args.name or "_".join(["ANIm", start_time.isoformat()])
    logger.info(termcolor("Analysis name: %s", "cyan"), name)

    # Get connection to existing database. This may or may not have data
    logger.debug("Connecting to database %s", args.dbpath)
    try:
        session = get_session(args.dbpath)
    except Exception:
        logger.error("Could not connect to database %s (exiting)",
                     args.dbpath,
                     exc_info=True)
        raise SystemExit(1)

    # Add information about this run to the database
    logger.debug("Adding run info to database %s...", args.dbpath)
    try:
        run = add_run(
            session,
            method="ANIm",
            cmdline=args.cmdline,
            date=start_time,
            status="started",
            name=name,
        )
    except PyaniORMException:
        logger.error("Could not add run %s to the database (exiting)",
                     run,
                     exc_info=True)
        raise SystemExit(1)
    logger.debug("...added run ID: %s to the database", run)

    # Identify input files for comparison, and populate the database
    logger.debug("Adding genomes for run %s to database...", run)
    try:
        genome_ids = add_run_genomes(session, run, args.indir, args.classes,
                                     args.labels)
    except PyaniORMException:
        logger.error("Could not add genomes to database for run %s (exiting)",
                     run)
        raise SystemExit(1)
    logger.debug("\t...added genome IDs: %s", genome_ids)

    # Generate commandlines for NUCmer analysis and output compression
    logger.info("Generating ANIm command-lines")
    deltadir = args.outdir / pyani_config.ALIGNDIR["ANIm"]
    logger.debug("NUCmer output will be written temporarily to %s", deltadir)

    # Create output directories
    logger.debug("Creating output directory %s", deltadir)
    try:
        deltadir.mkdir(exist_ok=True, parents=True)
    except IOError:
        logger.error("Could not create output directory %s (exiting)",
                     deltadir,
                     exc_info=True)
        raise SystemExit(1)

    # Get list of genome IDs for this analysis from the database
    logger.info("Compiling genomes for comparison")
    genomes = run.genomes.all()
    logger.debug("Collected %s genomes for this run", len(genomes))

    # Generate all pair combinations of genome IDs as a list of (Genome, Genome) tuples
    logger.info(
        "Compiling pairwise comparisons (this can take time for large datasets)..."
    )
    comparisons = list(
        combinations(tqdm(genomes, disable=args.disable_tqdm), 2))
    logger.info("\t...total parwise comparisons to be performed: %s",
                len(comparisons))

    # Check for existing comparisons; if one has been done (for the same
    # software package, version, and setting) we add the comparison to this run,
    # but remove it from the list of comparisons to be performed
    logger.info("Checking database for existing comparison data...")
    comparisons_to_run = filter_existing_comparisons(session, run, comparisons,
                                                     "nucmer", nucmer_version,
                                                     None, args.maxmatch)
    logger.info("\t...after check, still need to run %s comparisons",
                len(comparisons_to_run))

    # If there are no comparisons to run, update the Run matrices and exit
    # from this function
    if not comparisons_to_run:
        logger.info(
            termcolor(
                "All comparison results present in database (skipping comparisons)",
                "magenta",
            ))
        logger.info("Updating summary matrices with existing results")
        update_comparison_matrices(session, run)
        return

    # If we are in recovery mode, we are salvaging output from a previous
    # run, and do not necessarily need to rerun all the jobs. In this case,
    # we prepare a list of output files we want to recover from the results
    # in the output directory.
    if args.recovery:
        logger.warning("Entering recovery mode")
        logger.debug(
            "\tIn this mode, existing comparison output from %s is reused",
            deltadir)
        existingfiles = collect_existing_output(deltadir, "nucmer", args)
        logger.debug("\tIdentified %s existing output files for reuse",
                     len(existingfiles))
    else:
        existingfiles = list()
        logger.debug("\tIdentified no existing output files")

    # Create list of NUCmer jobs for each comparison still to be performed
    logger.info("Creating NUCmer jobs for ANIm")
    joblist = generate_joblist(comparisons_to_run, existingfiles, args)
    logger.debug("Generated %s jobs, %s comparisons", len(joblist),
                 len(comparisons_to_run))

    # Pass jobs to appropriate scheduler
    logger.debug("Passing %s jobs to %s...", len(joblist), args.scheduler)
    run_anim_jobs(joblist, args)
    logger.info("...jobs complete")

    # Process output and add results to database
    # This requires us to drop out of threading/multiprocessing: Python's SQLite3
    # interface doesn't allow sharing connections and cursors
    logger.info("Adding comparison results to database...")
    update_comparison_results(joblist, run, session, nucmer_version, args)
    update_comparison_matrices(session, run)
    logger.info("...database updated.")
Example #15
def download_genome(args: Namespace, filestem: str, tid: str, uid: str,
                    uid_class):
    """Download single genome data to output directory.

    :param args:  Namespace, command-line arguments
    :param filestem:  str, output filestem
    :param tid:  str, taxonID
    :param uid:  str, assembly UID
    :param uid_class:  namedtuple of NCBI classification (organism, genus, species, strain)
    """
    logger = logging.getLogger(__name__)

    skippedlist = []
    refseq_status, genbank_status = True, True  # set False if skipped

    dlfiledata = download.DLFileData(filestem,
                                     "ftp://ftp.ncbi.nlm.nih.gov/genomes/all",
                                     "genomic.fna.gz")
    logger.info("Retrieving URLs for %s", filestem)
    # Try RefSeq first
    dlstatus = download.download_genome_and_hash(
        args.outdir,
        args.timeout,
        dlfiledata,
        dltype="RefSeq",
        disable_tqdm=args.disable_tqdm,
    )
    # Pylint is confused by the content of dlstatus (a namedlist)
    if dlstatus.skipped:  # pylint: disable=no-member
        skippedlist.append(
            Skipped(
                tid,
                uid,
                uid_class.organism,
                uid_class.strain,
                dlstatus.url,  # pylint: disable=no-member
                "RefSeq",
            ))
        refseq_status = False

    # RefSeq failed, so try GenBank
    if not refseq_status:
        logger.warning(
            termcolor("RefSeq failed. Trying GenBank alternative assembly",
                      "magenta"))
        # Try GenBank assembly
        dlstatus = download.download_genome_and_hash(
            args.outdir,
            args.timeout,
            dlfiledata,
            dltype="GenBank",
            disable_tqdm=args.disable_tqdm,
        )
        # Pylint is confused by the content of dlstatus (a namedlist)
        if dlstatus.skipped:  # pylint: disable=no-member
            skippedlist.append(
                Skipped(
                    tid,
                    uid,
                    uid_class.organism,
                    uid_class.strain,
                    dlstatus.url,
                    "GenBank",
                ))
            genbank_status = False
            logger.warning(termcolor("GenBank failed.", "magenta"))

    if genbank_status or refseq_status:
        # One of the downloads worked: report information
        logger.debug("Downloaded from URL: %s", dlstatus.url)
        logger.debug("Wrote assembly to: %s", dlstatus.outfname)
        logger.debug("Wrote MD5 hashes to: %s", dlstatus.outfhash)

        # Check hash for the download
        hashstatus = download.check_hash(dlstatus.outfname, dlstatus.outfhash)
        logger.debug("Local MD5 hash: %s", hashstatus.localhash)
        logger.debug("NCBI MD5 hash: %s", hashstatus.localhash)
        if hashstatus.passed:
            logger.info(termcolor("MD5 hash check passed", "green"))
        else:
            logger.warning("MD5 hash check failed. Please check and retry.")

    return dlstatus, skippedlist
Example #16
def subcmd_download(args: Namespace) -> int:
    """Download assembled genomes in subtree of passed NCBI taxon ID.

    :param args:  Namespace, command-line arguments
    """
    # Create logger
    logger = logging.getLogger(__name__)
    logger.info(termcolor("Downloading genomes from NCBI", "red"))

    # Create output directory, respecting force/noclobber
    if not args.dryrun:
        tools.make_outdir(args.outdir, args.force, args.noclobber)
    else:
        logger.warning(
            termcolor("Dry run only: will not overwrite or download", "cyan"))

    # Set Entrez email
    download.set_ncbi_email(args.email)
    logger.info("Setting Entrez email address: %s", args.email)

    # Parse Entrez API key, if provided
    api_path = args.api_keypath.expanduser()
    if not api_path.is_file():
        logger.warning("API path %s not a valid file. Not using API key.",
                       api_path)
        api_key = None
    else:
        api_key = download.parse_api_key(api_path)
        logger.info("API key recovered from %s", api_path)

    # Get list of taxon IDs to download
    taxon_ids = download.split_taxa(args.taxon)
    logger.info(termcolor("Taxon IDs received: %s", "blue"), taxon_ids)

    # Get assembly UIDs for each taxon
    asm_dict = tools.make_asm_dict(taxon_ids, args.retries)
    for tid, uids in asm_dict.items():
        logger.debug(
            "Taxon ID summary\n\tQuery: %s\n\tasm count: %s\n\tUIDs: %s",
            tid,
            len(uids),
            uids,
        )

    # Compile outputs to write class and label files, and a list of
    # skipped downloads (and define a helper tuple for collating skipped
    # genome information)
    classes = []
    labels = []
    skippedlist = []
    Skipped = namedtuple("Skipped",
                         "taxon_id accession organism strain url dltype")

    # Download contigs and hashes for each assembly UID in the list
    # On completion of this loop, each assembly in the list will either be
    # downloaded or skipped (with skipped genome information preserved in
    # skippedlist), and class/label info will be collated, ready for writing
    # to file.
    # Summary information is reported to the logger for each eSummary that
    # can be recovered
    for tid, uids in asm_dict.items():
        logger.info(termcolor("Downloading contigs for Taxon ID %s", "blue"),
                    uids)
        for uid in uids:
            # Obtain eSummary
            logger.info(
                termcolor("Retrieving eSummary information for UID %s",
                          "cyan"), uid)
            esummary, filestem = download.get_ncbi_esummary(
                uid, args.retries, api_key)
            uid_class = download.get_ncbi_classification(esummary)

            # Report summary
            outstr = "\n\t".join([
                f"Species Taxid: {esummary['SpeciesTaxid']}",
                f"TaxID: {esummary['Taxid']}",
                f"Accession: {esummary['AssemblyAccession']}",
                f"Name: {esummary['AssemblyName']}",
                f"Organism: {uid_class.organism}",
                f"Genus: {uid_class.genus}",
                f"Species: {uid_class.species}",
                f"Strain: {uid_class.strain}",
            ])
            logger.debug("eSummary information:\n\t%s", outstr)
            if args.dryrun:
                logger.warning("(dry-run) skipping download of %s",
                               esummary["AssemblyAccession"])
                continue

            # Obtain URLs, trying the RefSeq filestem first, then GenBank if
            # there's a failure
            dlfiledata = tools.DLFileData(
                filestem, "ftp://ftp.ncbi.nlm.nih.gov/genomes/all",
                "genomic.fna.gz")
            logger.info("Retrieving URLs for %s", filestem)
            # Try RefSeq first
            dlstatus = tools.download_genome_and_hash(
                args,
                dlfiledata,
                dltype="RefSeq",
                disable_tqdm=args.disable_tqdm,
            )
            # RefSeq failed, try GenBank
            # Pylint is confused by the content of dlstatus (a namedlist)
            if dlstatus.skipped:  # pylint: disable=no-member
                skippedlist.append(
                    Skipped(
                        tid,
                        uid,
                        uid_class.organism,
                        uid_class.strain,
                        dlstatus.url,  # pylint: disable=no-member
                        "RefSeq",
                    ))
                logger.warning(
                    "RefSeq failed. Trying GenBank alternative assembly")
                # Try GenBank assembly
                dlstatus = tools.download_genome_and_hash(
                    args,
                    dlfiledata,
                    dltype="GenBank",
                    disable_tqdm=args.disable_tqdm,
                )
                # Pylint is confused by the content of dlstatus (a namedlist)
                if dlstatus.skipped:  # pylint: disable=no-member
                    skippedlist.append(
                        Skipped(
                            tid,
                            uid,
                            uid_class.organism,
                            uid_class.strain,
                            dlstatus.url,
                            "GenBank",
                        ))
                    logger.warning("GenBank failed.")
                    continue  # Move straight on to the next download

            # One of the downloads worked: report information
            logger.debug("Downloaded from URL: %s", dlstatus.url)
            logger.debug("Wrote assembly to: %s", dlstatus.outfname)
            logger.debug("Wrote MD5 hashes to: %s", dlstatus.outfhash)

            # Check hash for the download
            hashstatus = download.check_hash(dlstatus.outfname,
                                             dlstatus.outfhash)
            logger.debug("Local MD5 hash: %s", hashstatus.localhash)
            logger.debug("NCBI MD5 hash: %s", hashstatus.localhash)
            if hashstatus.passed:
                logger.info(termcolor("MD5 hash check passed", "green"))
            else:
                logger.warning(
                    "MD5 hash check failed. Please check and retry.")

            # Extract downloaded files
            ename = dlstatus.outfname.with_suffix(
                "")  # should strip only last suffix
            if ename.exists() and args.noclobber:
                logger.warning("Output file %s exists, not extracting", ename)
            else:
                logger.debug("Extracting archive %s to %s", dlstatus.outfname,
                             ename)
                download.extract_contigs(dlstatus.outfname, ename)

            # Modify sequence ID header if Kraken option active
            if args.kraken:
                logger.warning(
                    "Modifying downloaded sequence for Kraken compatibility")
                seqdata = list(SeqIO.parse(ename, "fasta"))
                logger.debug("Modifying %s", ename)
                for seq in seqdata:
                    seq.id = "|".join(
                        [seq.id, "kraken:taxid", esummary["SpeciesTaxid"]])
                SeqIO.write(seqdata, ename, "fasta")

            # Create MD5 hash for the downloaded contigs
            logger.debug("Creating local MD5 hash for %s", ename)
            hashfname = ename.with_suffix(".md5")
            datahash = download.create_hash(ename)
            logger.debug("Writing hash to %s", hashfname)
            with open(hashfname, "w") as hfh:
                hfh.write("\t".join([datahash, str(ename)]) + "\n")
            # Make label/class text
            labeltxt, classtxt = download.create_labels(
                uid_class, filestem, datahash)
            classes.append(classtxt)
            labels.append(labeltxt)
            logger.info(
                "Label and class file entries\n\tLabel: %s\n\tClass: %s",
                labeltxt,
                classtxt,
            )

    # Write class and label files
    classfname = args.outdir / args.classfname
    logger.info("Writing classes file to %s", classfname)
    if classfname.exists() and args.noclobber:
        logger.warning("Class file %s exists, not overwriting", classfname)
    else:
        with open(classfname, "w") as ofh:
            ofh.write("\n".join(classes) + "\n")

    labelfname = args.outdir / args.labelfname
    logger.info("Writing labels file to %s", labelfname)
    if labelfname.exists() and args.noclobber:
        logger.warning("Labels file %s exists, not overwriting", labelfname)
    else:
        with open(labelfname, "w") as ofh:
            ofh.write("\n".join(labels) + "\n")

    # Report skipped genome list
    if skippedlist:
        logger.warning(termcolor("%s genome downloads were skipped", "red"),
                       len(skippedlist))
        for skipped in skippedlist:
            outstr = "\n\t".join([
                f"taxon id: {skipped.taxon_id}",
                f"accession: {skipped.accession}",
                f"URL: {skipped.url}",
                f"source: {skipped.dltype}",
            ])
            logger.warning("%s %s:\n\t%s", skipped.organism, skipped.strain,
                           outstr)

    return 0
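For reference, the Kraken relabelling step above rewrites each sequence ID in place; with hypothetical values:

# Illustration of the Kraken-compatible ID rewrite (hypothetical values)
seq_id, species_taxid = "NC_004547.2", "218491"
kraken_id = "|".join([seq_id, "kraken:taxid", species_taxid])
assert kraken_id == "NC_004547.2|kraken:taxid|218491"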
Example #17
import logging
import sys
import time

from typing import List, Optional

from pyani.logger import config_logger
from pyani.pyani_tools import termcolor

from .parsers import parse_cmdline
from .. import __version__

CITATION_INFO = [
    termcolor(
        "If you use pyani in your work, please cite the following publication:",
        "green",
    ),
    termcolor(
        "\tPritchard, L., Glover, R. H., Humphris, S., Elphinstone, J. G.,",
        "yellow",
    ),
    termcolor(
        "\t& Toth, I.K. (2016) 'Genomics and taxonomy in diagnostics for",
        "yellow"),
    termcolor(
        "\tfood security: soft-rotting enterobacterial plant pathogens.'",
        "yellow"),
    termcolor(
        "\tAnalytical Methods, 8(1), 12–24. http://doi.org/10.1039/C5AY02550H",
        "yellow",
    ),