Example #1
def subcmd_plot(args: Namespace, logger: Logger) -> int:
    """Produce graphical output for an analysis.

    :param args:  Namespace of command-line arguments
    :param logger:  logging object

    The graphical output represents the ANI analysis results as a heatmap,
    or as a heatmap with an accompanying dendrogram.
    """
    # Announce what's going on to the user
    logger.info("Generating graphical output for analyses")
    logger.info(f"Writing output to: {args.outdir}")
    os.makedirs(args.outdir, exist_ok=True)
    logger.info(f"Rendering method: {args.method}")

    # Connect to database session
    logger.info(f"Activating session for database: {args.dbpath}")
    session = pyani_orm.get_session(args.dbpath)

    # Parse output formats
    outfmts = args.formats.split(",")
    logger.info(f"Requested output formats: {outfmts}")

    # Work on each run:
    run_ids = [int(run) for run in args.run_id.split(",")]
    logger.info(f"Generating graphics for runs: {run_ids}")
    for run_id in run_ids:
        write_run_heatmaps(run_id, session, outfmts, args, logger)

    return 0
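
A minimal invocation sketch for the function above. The Namespace attribute names are taken from the code itself; the values are hypothetical, and the sketch assumes subcmd_plot is importable from the surrounding pyani module.

import logging
from argparse import Namespace

args = Namespace(
    outdir="plots_out",   # output directory, created if absent
    method="seaborn",     # rendering backend (hypothetical value)
    dbpath="pyanidb",     # path to the SQLite3 database
    formats="png,pdf",    # comma-separated output formats
    run_id="1,2",         # comma-separated run IDs to plot
)
subcmd_plot(args, logging.getLogger("pyani"))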
Example #2
def subcmd_plot(args: Namespace) -> int:
    """Produce graphical output for an analysis.

    :param args:  Namespace of command-line arguments

    The graphical output represents the ANI analysis results as a heatmap,
    or as a heatmap with an accompanying dendrogram.
    """
    logger = logging.getLogger(__name__)

    # Announce what's going on to the user
    logger.info(termcolor("Generating graphical output for analyses", "red"))
    logger.info("Writing output to: %s", args.outdir)
    os.makedirs(args.outdir, exist_ok=True)
    logger.info("Rendering method: %s", args.method)

    # Connect to database session
    logger.debug("Activating session for database: %s", args.dbpath)
    session = pyani_orm.get_session(args.dbpath)

    # Parse output formats
    outfmts = args.formats.split(",")
    logger.debug("Requested output formats: %s", outfmts)

    # Work on each run:
    run_ids = [int(run) for run in args.run_id.split(",")]
    logger.debug("Generating graphics for runs: %s", run_ids)
    for run_id in run_ids:
        write_run_heatmaps(run_id, session, outfmts, args)

    return 0
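
Example #2 drops the explicit logger argument and switches from f-strings to %-style logging calls. A short, general illustration of why the %-style form is preferred: the message is only interpolated if the record passes the level filter, whereas an f-string is built unconditionally. This is standard logging behaviour, not pyani-specific.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

outfmts = ["png", "pdf", "svg"]
logger.debug("Requested output formats: %s", outfmts)  # below INFO: never formatted
logger.debug(f"Requested output formats: {outfmts}")   # f-string built regardless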
Example #3
def subcmd_classify(args: Namespace, logger: Logger) -> int:
    """Generate classifications for an analysis.

    :param args:  Namespace, command-line arguments
    :param logger:  logging object
    """
    # Tell the user what's going on
    logger.info(f"Generating classification for ANI run: {args.run_id}")
    logger.info(f"\tWriting output to: {args.outdir}")
    logger.info(f"\tCoverage threshold: {args.cov_min}")
    logger.info(f"\tInitial minimum identity threshold: {args.id_min}")

    # Get results data for the specified run
    logger.info(f"Acquiring results for run: {args.run_id}")
    logger.info(f"Connecting to database: {args.dbpath}")
    session = pyani_orm.get_session(args.dbpath)
    logger.info("Retrieving results matrices")
    results = (session.query(
        pyani_orm.Run).filter(pyani_orm.Run.run_id == args.run_id).first())
    result_label_dict = pyani_orm.get_matrix_labels_for_run(
        session, args.run_id)

    # Generate initial graph on basis of results
    logger.info("Constructing graph from results.")
    initgraph = pyani_classify.build_graph_from_results(
        results, result_label_dict, args.cov_min, args.id_min)
    logger.info(
        "Returned graph has %d nodes:\n\t%s",
        len(initgraph),
        "\n\t".join([n for n in initgraph]),
    )
    logger.info(
        "Initial graph clique information:\n\t%s",
        pyani_classify.analyse_cliques(initgraph),
    )

    # Obtain all subgraph splits, thresholding by identity
    subgraphs = trimmed_graph_sequence(initgraph, args)
    special_intervals = [_ for _ in subgraphs if _.cliqueinfo.all_k_complete]
    outstr = "\n\t".join(
        [f"{_.interval}\t{_.cliqueinfo}" for _ in special_intervals])
    logger.info(
        f"{len(special_intervals)} intervals with special property:\n\t{outstr}"
    )
    if args.show_all:
        outstr = "\n\t".join(
            [f"{_.interval}\t{_.cliqueinfo}" for _ in subgraphs])
        logger.info(f"Subgraphs at all identity thresholds:\n\t{outstr}")

    return 0
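
An illustrative sketch of the identity-thresholding idea behind trimmed_graph_sequence: remove edges whose identity falls below a cutoff, then inspect what remains. This is not pyani_classify's implementation, only the general technique, shown with networkx and hypothetical edge data.

import networkx as nx

def trim_graph(graph: nx.Graph, id_min: float) -> nx.Graph:
    # Return a copy of graph with all sub-threshold identity edges removed
    trimmed = graph.copy()
    trimmed.remove_edges_from(
        (u, v) for u, v, data in graph.edges(data=True) if data["identity"] < id_min
    )
    return trimmed

graph = nx.Graph()
graph.add_edge("genome_A", "genome_B", identity=0.97, coverage=0.81)
graph.add_edge("genome_B", "genome_C", identity=0.89, coverage=0.75)
print(list(nx.connected_components(trim_graph(graph, 0.95))))
# [{'genome_A', 'genome_B'}, {'genome_C'}]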
Example #4
def subcmd_report(args: Namespace) -> int:
    """Present report on ANI results and/or database contents.

    :param args:  Namespace, command-line arguments

    The report subcommand takes any of several long options that do one of two
    things:

    1. Perform a single action.
    2. Set a parameter/format.

    These will typically take an output path to a file or directory into which
    the report will be written (whatever form it takes). By default, text
    output is written in plain text format, but for some outputs this can
    be modified by an 'excel' or 'html' format specifier, which writes outputs
    in that format, where possible.
    """
    logger = logging.getLogger(__name__)

    # Output formats will apply across all tabular data requested
    # Expect comma-separated format arguments, and turn them into an iterable
    formats = process_formats(args)
    logger.info(termcolor("Creating report output in formats: %s", "red"),
                formats)

    # Declare which database is being used, and connect to session
    logger.debug("Using database: %s", args.dbpath)
    session = pyani_orm.get_session(args.dbpath)

    # Report runs in the database
    if args.show_runs:
        statement = session.query(Run.run_id, Run.name, Run.method, Run.date,
                                  Run.cmdline).statement
        headers = ["run ID", "name", "method", "date run", "command-line"]
        report(args, session, formats, ReportParams("runs", statement,
                                                    headers))

    # Report genomes in the database
    if args.show_genomes:
        statement = session.query(
            Genome.genome_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Genome.length,
        ).statement
        headers = [
            "genome ID", "description", "path", "MD5 hash", "genome length"
        ]
        report(args, session, formats,
               ReportParams("genomes", statement, headers))

    # Report table of all genomes used for each run
    if args.show_runs_genomes:
        statement = (session.query(
            Run.run_id,
            Run.name,
            Run.method,
            Run.date,
            Genome.genome_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Label.label,
            Label.class_label,
        ).join(rungenome, Genome.genome_id == rungenome.c.genome_id).join(
            Label,
            and_(Genome.genome_id == Label.genome_id,
                 Run.run_id == Label.run_id),
        ).order_by(Run.run_id, Genome.genome_id).statement)
        headers = [
            "run ID",
            "run name",
            "method",
            "date run",
            "genome ID",
            "genome description",
            "genome path",
            "genome hash",
            "genome label",
            "genome class",
        ]
        report(
            args,
            session,
            formats,
            ReportParams("runs_genomes", statement, headers),
        )

    # Report table of all runs in which a genome is involved
    if args.show_genomes_runs:
        statement = (session.query(
            Genome.genome_id,
            Run.run_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Label.label,
            Label.class_label,
            Run.name,
            Run.method,
            Run.date,
        ).join(rungenome, Run.run_id == rungenome.c.run_id).join(
            Label,
            and_(Genome.genome_id == Label.genome_id,
                 Run.run_id == Label.run_id),
        ).order_by(Genome.genome_id, Run.run_id).statement)
        headers = [
            "genome ID",
            "run ID",
            "genome description",
            "genome path",
            "genome hash",
            "genome label",
            "genome class",
            "run name",
            "method",
            "date run",
        ]
        report(
            args,
            session,
            formats,
            ReportParams("genomes_runs", statement, headers),
        )

    # Report table of comparison results for the indicated runs
    if args.run_results:
        run_ids = [run_id.strip() for run_id in args.run_results.split(",")]
        logger.debug("Attempting to write results tables for runs: %s",
                     run_ids)
        for run_id in run_ids:
            logger.debug("Processing run ID %s", run_id)
            genome_query = aliased(Genome, name="genome_query")
            genome_subject = aliased(Genome, name="genome_subject")
            statement = (session.query(
                Comparison.comparison_id,
                Comparison.query_id,
                genome_query.description,
                Comparison.subject_id,
                genome_subject.description,
                Comparison.identity,
                Comparison.cov_query,
                Comparison.cov_subject,
                Comparison.aln_length,
                Comparison.sim_errs,
                Comparison.program,
                Comparison.version,
                Comparison.fragsize,
                Comparison.maxmatch,
                Run.run_id,
            ).join(
                genome_query,
                Comparison.query_id == genome_query.genome_id).join(
                    genome_subject,
                    Comparison.subject_id == genome_subject.genome_id).filter(
                        Run.run_id == run_id).statement)
            headers = [
                "Comparison ID",
                "Query ID",
                "Query description",
                "Subject ID",
                "Subject description",
                "% identity",
                "% query coverage",
                "% subject coverage",
                "alignment length",
                "similarity errors",
                "program",
                "version",
                "fragment size",
                "maxmatch",
                "Run ID",
            ]
            report(
                args,
                session,
                formats,
                ReportParams(f"results_{run_id}", statement, headers),
            )

    # Report matrices of comparison results for the indicated runs
    # For ANIm, all results other than coverage are symmetric matrices,
    # so we only get results in the forward direction.
    # As we need to pull down the matrices as Pandas dataframes by reading from
    # JSON, we don't bother with a helper function like report(), and write out
    # our matrices directly, here
    if args.run_matrices:
        for run_id in [
                run_id.strip() for run_id in args.run_matrices.split(",")
        ]:
            logger.debug("Extracting matrices for run %s", run_id)
            run = session.query(Run).filter(Run.run_id == run_id).first()
            matlabel_dict = get_matrix_labels_for_run(session, run_id)
            for matdata in [
                    MatrixData(*_) for _ in [
                        ("identity", run.df_identity, {
                            "colour_num": 0.95
                        }),
                        ("coverage", run.df_coverage, {
                            "colour_num": 0.95
                        }),
                        ("aln_lengths", run.df_alnlength, {}),
                        ("sim_errors", run.df_simerrors, {}),
                        ("hadamard", run.df_hadamard, {}),
                    ]
            ]:
                logger.debug("Writing %s results", matdata.name)
                matrix = pd.read_json(matdata.data)
                # Matrix rows and columns are labelled if there's a label dictionary,
                # and take the dataframe index otherwise
                matrix = label_results_matrix(matrix, matlabel_dict)
                pyani_report.write_dbtable(
                    matrix,
                    Path("_".join([
                        str(args.outdir / "matrix"), matdata.name,
                        str(run_id)
                    ])),
                    formats,
                    show_index=True,
                    **matdata.graphic_args,
                )

    return 0
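
A small round-trip sketch for the matrix handling at the end of Example #4: pyani stores each results matrix as JSON text on the Run row (e.g. run.df_identity), and the report code rehydrates it with pandas. The data here are hypothetical; newer pandas versions prefer a file-like object for read_json, so the sketch wraps the string in StringIO.

from io import StringIO

import pandas as pd

df = pd.DataFrame(
    [[1.00, 0.97], [0.97, 1.00]],
    index=["genome_1", "genome_2"],
    columns=["genome_1", "genome_2"],
)
as_json = df.to_json()                    # roughly what Run.df_identity holds
matrix = pd.read_json(StringIO(as_json))  # what the report code reads back
print(matrix)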
Example #5
def subcmd_anib(args: Namespace, logger: Logger) -> None:
    """Perform ANIb on all genome files in an input directory.

    :param args:  Namespace, command-line arguments
    :param logger:  logging object

    Finds ANI by the ANIb method, as described in Goris J, Konstantinidis KT,
    Klappenbach JA, Coenye T, Vandamme P, et al. (2007) DNA-DNA hybridization
    values and their relationship to whole-genome sequence similarities.
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0.

    All FASTA format files (selected by suffix) in the input directory are
    fragmented into (by default 1020nt) consecutive sections, and a BLAST+
    database constructed from the whole genome input. The BLAST+ blastn tool
    is then used to query each set of fragments against each BLAST+ database,
    in turn.

    For each query, the BLAST+ .tab output is parsed to obtain alignment length,
    identity and similarity error count. Alignments below a threshold are not
    included in the calculation (this introduces systematic bias with respect to
    ANIm). The results are processed to calculate the ANI percentages, coverage,
    and similarity error.

    The calculated values are stored in the local SQLite3 database.
    """
    logger.info("Running ANIm analysis")  # announce that we're starting

    # Get BLAST+ version - this will be used in the database entries
    blastn_version = anib.get_version(args.blastn_exe)
    logger.info(f"BLAST+ blastn version: {blastn_version}")

    # Use provided name, or make new one for this analysis
    start_time = datetime.datetime.now()
    name = args.name or "_".join(["ANIb", start_time.isoformat()])
    logger.info(f"Analysis name: {name}")

    # Connect to existing database (which may be "clean" or have old analyses)
    logger.info(f"Connecting to database {args.dbpath}")
    try:
        session = get_session(args.dbpath)
    except Exception:
        logger.error(f"Could not connect to database {args.dbpath} (exiting)",
                     exc_info=True)
        raise SystemExit(1)

    # Add information about this run to the database
    logger.info(f"Adding run info to database {args.dbpath}...")
    try:
        run = add_run(
            session,
            method="ANIb",
            cmdline=args.cmdline,
            date=start_time,
            status="started",
            name=name,
        )
    except PyaniORMException:
        logger.error("Could not add run to the database (exiting)",
                     exc_info=True)
        raise SystemExit(1)
    logger.info(f"\t...added run ID: {run} to the database")

    # Identify input files for comparison, and populate the database
    logger.info(f"Adding files for {run} to database...")
    try:
        genome_ids = add_run_genomes(session, run, args.indir, args.classes,
                                     args.labels)
    except PyaniORMException:
        logger.error(
            f"Could not add genomes to database for run {run} (exiting)",
            exc_info=True)
        raise SystemExit(1)
    logger.info(f"\t...added genome IDs: {genome_ids}")

    # Get list of genomes for this analysis from the database
    logger.info("Compiling genomes for comparison")
    genomes = run.genomes.all()
    logger.info(f"\tCollected {len(genomes)} genomes for this run")

    # Create output directories. We create the main parent directory (args.outdir),
    # and also subdirectories for the sequence fragments and BLAST databases.
    logger.info(f"Creating output directory {args.outdir}")
    try:
        os.makedirs(args.outdir, exist_ok=True)
    except IOError:
        logger.error(
            f"Could not create output directory {args.outdir} (exiting)",
            exc_info=True)
        raise SystemExit(1)
    fragdir = Path(str(args.outdir)) / "fragments"
    blastdbdir = Path(str(args.outdir)) / "blastdbs"
    logger.info(f"\t...creating subdirectories")
    os.makedirs(fragdir, exist_ok=True)
    os.makedirs(blastdbdir, exist_ok=True)

    # Create a new sequence fragment file and a new BLAST+ database for each input genome,
    # and add this data to the database as a row in BlastDB
    logger.info("Creating input sequence fragment files...")
    for genome in genomes:
        fragpath, fraglengths = fragment_fasta_file(Path(str(genome.path)),
                                                    Path(str(fragdir)),
                                                    args.fragsize)
        logger.debug("Wrote fragment file %s (%d fragments)", fragpath,
                     len(fraglengths))
        # blastdb = add_blastdb(
        #     session, genome, run, fragpath, dbpath, fraglengths, dbcmd
        # )

    raise NotImplementedError

    # Generate all pair permutations of genome IDs as a list of (Genome, Genome) tuples
    logger.info(
        "Compiling pairwise comparisons (this can take time for large datasets)..."
    )
    comparisons = list(
        permutations(tqdm(genomes, disable=args.disable_tqdm), 2))
    logger.info(
        f"\t...total parwise comparisons to be performed: {len(comparisons)}")

    # Check for existing comparisons; if one has already been done (for the same
    # software package, version, and setting) we add the comparison to this run,
    # but remove it from the list of comparisons to be performed
    logger.info("Checking database for existing comparison data...")
    comparisons_to_run = filter_existing_comparisons(session, run, comparisons,
                                                     "blastn", blastn_version,
                                                     args.fragsize, None)
    logger.info(
        f"\t...after check, still need to run {len(comparisons_to_run)} comparisons"
    )

    # If there are no comparisons to run, update the Run matrices and exit
    # from this function
    if not comparisons_to_run:
        logger.info(
            "All comparison results present in database (skipping comparisons)"
        )
        logger.info("Updating summary matrices with existing results")
        update_comparison_matrices(session, run)
        return

    # If we are in recovery mode, we are salvaging output from a previous
    # run, and do not necessarily need to rerun all the jobs. In this case,
    # we prepare a list of output files we want to recover from the results
    # in the output directory.
    if args.recovery:
        logger.warning("Entering recovery mode...")
        logger.info(
            f"\tIn this mode, existing comparison output from {args.outdir} is reused"
        )
        existingfiles = collect_existing_output(args.outdir, "blastn", args)
        logger.info(
            f"\tIdentified {len(existingfiles)} existing output files for reuse"
        )
    else:
        existingfiles = []
        logger.info("\tIdentified no existing output files")

    # Split the input genome files into contiguous fragments of the specified size,
    # as described in Goris et al. We create a new directory to hold sequence
    # fragments, away from the main genomes
    logger.info(
        f"Splitting input genome files into {args.fragsize}nt fragments...")
    fragdir = Path(args.outdir) / "fragments"
    os.makedirs(fragdir, exist_ok=True)
    fragfiles, fraglens = anib.fragment_fasta_files(
        [Path(str(_.path)) for _ in genomes],
        Path(args.outdir) / "fragments",
        args.fragsize,
    )
    logger.info(f"...wrote {len(fragfiles)} fragment files to {fragdir}")

    # Create list of BLASTN jobs for each comparison still to be performed
    logger.info("Creating blastn jobs for ANIb...")
    joblist = generate_joblist(comparisons_to_run, existingfiles, fragfiles,
                               fraglens, args, logger)
    logger.info(f"...created {len(joblist)} blastn jobs")

    raise NotImplementedError
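
The fragmentation step this function stops short of wiring into the database follows Goris et al.: each input genome is cut into consecutive fragments of fragsize nt (1020 by default). An illustrative sketch of that step, assuming Biopython is available; this is not pyani's own anib.fragment_fasta_files.

from pathlib import Path

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def fragment_fasta(infile: Path, outdir: Path, fragsize: int = 1020) -> Path:
    # Write consecutive fragsize-nt fragments of every sequence in infile
    outfile = outdir / f"{infile.stem}-fragments.fasta"
    fragments = []
    for record in SeqIO.parse(str(infile), "fasta"):
        for idx, start in enumerate(range(0, len(record.seq), fragsize)):
            fragments.append(
                SeqRecord(
                    record.seq[start:start + fragsize],
                    id=f"frag{idx + 1:05d}_{record.id}",
                    description="",
                ))
    SeqIO.write(fragments, str(outfile), "fasta")
    return outfile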
Example #6
def subcmd_anim(args: Namespace) -> None:
    """Perform ANIm on all genome files in an input directory.

    :param args:  Namespace, command-line arguments

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (whose path must
    be provided).

    For each pairwise comparison, the NUCmer .delta file output is parsed to
    obtain an alignment length and similarity error count for every unique
    region alignment between the two organisms, as represented by
    sequences in the FASTA files. These are processed to calculate aligned
    sequence lengths, average nucleotide identity (ANI) percentages, coverage
    (aligned percentage of whole genome - forward direction), and similarity
    error count for each pairwise comparison.

    The calculated values are deposited in the SQLite3 database being used for
    the analysis.

    For each pairwise comparison the NUCmer output is stored in the output
    directory for long enough to extract summary information, but for each run
    the output is gzip compressed. Once all runs are complete, the outputs
    for each comparison are concatenated into a single gzip archive.
    """
    # Create logger
    logger = logging.getLogger(__name__)

    # Announce the analysis
    logger.info(termcolor("Running ANIm analysis", bold=True))

    # Get current nucmer version
    nucmer_version = anim.get_version(args.nucmer_exe)
    logger.info(termcolor("MUMMer nucmer version: %s", "cyan"), nucmer_version)

    # Use the provided name or make one for the analysis
    start_time = datetime.datetime.now()
    name = args.name or "_".join(["ANIm", start_time.isoformat()])
    logger.info(termcolor("Analysis name: %s", "cyan"), name)

    # Get connection to existing database. This may or may not have data
    logger.debug("Connecting to database %s", args.dbpath)
    try:
        session = get_session(args.dbpath)
    except Exception:
        logger.error("Could not connect to database %s (exiting)",
                     args.dbpath,
                     exc_info=True)
        raise SystemExit(1)

    # Add information about this run to the database
    logger.debug("Adding run info to database %s...", args.dbpath)
    try:
        run = add_run(
            session,
            method="ANIm",
            cmdline=args.cmdline,
            date=start_time,
            status="started",
            name=name,
        )
    except PyaniORMException:
        logger.error("Could not add run %s to the database (exiting)",
                     name,
                     exc_info=True)
        raise SystemExit(1)
    logger.debug("...added run ID: %s to the database", run)

    # Identify input files for comparison, and populate the database
    logger.debug("Adding genomes for run %s to database...", run)
    try:
        genome_ids = add_run_genomes(session, run, args.indir, args.classes,
                                     args.labels)
    except PyaniORMException:
        logger.error("Could not add genomes to database for run %s (exiting)",
                     run)
        raise SystemExit(1)
    logger.debug("\t...added genome IDs: %s", genome_ids)

    # Generate commandlines for NUCmer analysis and output compression
    logger.info("Generating ANIm command-lines")
    deltadir = args.outdir / pyani_config.ALIGNDIR["ANIm"]
    logger.debug("NUCmer output will be written temporarily to %s", deltadir)

    # Create output directories
    logger.debug("Creating output directory %s", deltadir)
    try:
        deltadir.mkdir(exist_ok=True, parents=True)
    except IOError:
        logger.error("Could not create output directory %s (exiting)",
                     deltadir,
                     exc_info=True)
        raise SystemExit(1)

    # Get list of genome IDs for this analysis from the database
    logger.info("Compiling genomes for comparison")
    genomes = run.genomes.all()
    logger.debug("Collected %s genomes for this run", len(genomes))

    # Generate all pair combinations of genome IDs as a list of (Genome, Genome) tuples
    logger.info(
        "Compiling pairwise comparisons (this can take time for large datasets)..."
    )
    comparisons = list(
        combinations(tqdm(genomes, disable=args.disable_tqdm), 2))
    logger.info("\t...total parwise comparisons to be performed: %s",
                len(comparisons))

    # Check for existing comparisons; if one has been done (for the same
    # software package, version, and setting) we add the comparison to this run,
    # but remove it from the list of comparisons to be performed
    logger.info("Checking database for existing comparison data...")
    comparisons_to_run = filter_existing_comparisons(session, run, comparisons,
                                                     "nucmer", nucmer_version,
                                                     None, args.maxmatch)
    logger.info("\t...after check, still need to run %s comparisons",
                len(comparisons_to_run))

    # If there are no comparisons to run, update the Run matrices and exit
    # from this function
    if not comparisons_to_run:
        logger.info(
            termcolor(
                "All comparison results present in database (skipping comparisons)",
                "magenta",
            ))
        logger.info("Updating summary matrices with existing results")
        update_comparison_matrices(session, run)
        return

    # If we are in recovery mode, we are salvaging output from a previous
    # run, and do not necessarily need to rerun all the jobs. In this case,
    # we prepare a list of output files we want to recover from the results
    # in the output directory.
    if args.recovery:
        logger.warning("Entering recovery mode")
        logger.debug(
            "\tIn this mode, existing comparison output from %s is reused",
            deltadir)
        existingfiles = collect_existing_output(deltadir, "nucmer", args)
        logger.debug("\tIdentified %s existing output files for reuse",
                     len(existingfiles))
    else:
        existingfiles = list()
        logger.debug("\tIdentified no existing output files")

    # Create list of NUCmer jobs for each comparison still to be performed
    logger.info("Creating NUCmer jobs for ANIm")
    joblist = generate_joblist(comparisons_to_run, existingfiles, args)
    logger.debug("Generated %s jobs, %s comparisons", len(joblist),
                 len(comparisons_to_run))

    # Pass jobs to appropriate scheduler
    logger.debug("Passing %s jobs to %s...", len(joblist), args.scheduler)
    run_anim_jobs(joblist, args)
    logger.info("...jobs complete")

    # Process output and add results to database
    # This requires us to drop out of threading/multiprocessing: Python's SQLite3
    # interface doesn't allow sharing connections and cursors
    logger.info("Adding comparison results to database...")
    update_comparison_results(joblist, run, session, nucmer_version, args)
    update_comparison_matrices(session, run)
    logger.info("...database updated.")