Example #1
def subcmd_index(args: Namespace, logger: Logger) -> int:
    """Generate a file with the MD5 hash for each genome in an input directory.

    :param args:  Namespace, received command-line arguments
    :param logger:  logging object

    Identify the genome files in the input directory, and generate a single
    MD5 for each so that <genome>.fna produces <genome>.md5

    Genome files (FASTA) are identified from the file extension.
    """
    # Get list of FASTA files in the input directory
    logger.info("Scanning directory %s for FASTA files", args.indir)
    fpaths = pyani_files.get_fasta_paths(args.indir)
    logger.info("Found FASTA files:")
    logger.info("\n".join(f"\t{fpath}" for fpath in fpaths))

    # Lists of class/label information
    classes = []
    labels = []

    # Create MD5 hash for each file, if needed
    for fpath in fpaths:
        hashfname = fpath.with_suffix(".md5")
        if hashfname.is_file():
            logger.info("%s already indexed (using existing hash)", fpath)
            with open(hashfname, "r") as ifh:
                datahash = ifh.readline().split()[0]
        else:
            # Write an .md5 hash file
            datahash = download.create_hash(fpath)
            logger.info("Writing hash to %s", hashfname)
            with open(hashfname, "w") as hfh:
                hfh.write(f"{datahash}\t{fpath}\n")

        # Parse the file and get the label/class information
        with open(fpath, "r") as sfh:
            # Label is the first record's description, minus the sequence ID
            label = next(SeqIO.parse(sfh, "fasta")).description.split(" ", 1)[-1]
        labels.append("\t".join([datahash, fpath.stem, label]))
        classes.append("\t".join([datahash, fpath.stem, label]))

    # Write class and label files
    classfname = args.indir / args.classfname
    logger.info("Writing classes file to %s", classfname)
    if classfname.exists():
        logger.warning("Class file %s exists, not overwriting", classfname)
    else:
        with open(classfname, "w") as ofh:
            ofh.write("\n".join(classes) + "\n")

    labelfname = args.indir / args.labelfname
    logger.info("Writing labels file to %s", labelfname)
    if labelfname.exists():
        logger.warning("Labels file %s exists, not overwriting", labelfname)
    else:
        with open(labelfname, "w") as ofh:
            ofh.write("\n".join(labels) + "\n")

    return 0
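
The examples on this page rely on download.create_hash to produce each genome's MD5. A minimal sketch of an equivalent helper, assuming it returns the hex digest of the file's contents (the real pyani helper may differ in signature and details):

import hashlib
from pathlib import Path


def create_hash(fname: Path, blocksize: int = 65536) -> str:
    """Return the MD5 hex digest of a file's contents.

    Assumed behaviour of pyani's download.create_hash; reads in
    binary chunks so large FASTA files are not held in memory.
    """
    md5 = hashlib.md5()
    with open(fname, "rb") as fh:
        for chunk in iter(lambda: fh.read(blocksize), b""):
            md5.update(chunk)
    return md5.hexdigest()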
Example #2
def hash_genomes(args: Namespace, dlstatus: download.DLStatus, filestem: str,
                 uid_class) -> Tuple[str, str]:
    """Hash genome files in passed dlstatus.

    :param args:  Namespace of command-line arguments
    :param dlstatus:  download.DLStatus, status of the downloaded genome
    :param filestem:  str, filestem for output
    :param uid_class:  NCBI classification (organism, genus, species, strain)
    """
    logger = logging.getLogger(__name__)

    # Create MD5 hash for the downloaded contigs
    ename = dlstatus.outfname.with_suffix("")  # should strip only last suffix
    logger.debug("Creating local MD5 hash for %s", ename)
    hashfname = ename.with_suffix(".md5")
    datahash = download.create_hash(ename)
    logger.debug("Writing hash to %s", hashfname)
    with open(hashfname, "w") as hfh:
        hfh.write("\t".join([datahash, str(ename)]) + "\n")
    # Make label/class text
    labeltxt, classtxt = download.create_labels(uid_class, filestem, datahash)
    return labeltxt, classtxt
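
download.create_labels, used above, turns the NCBI classification into one label entry and one class entry. Below is a sketch modelled on the inline construction in get_ncbi_asm (Example #3); the field layout is inferred, and the Classification tuple is a hypothetical stand-in for the object returned by download.get_ncbi_classification:

from typing import NamedTuple, Tuple


class Classification(NamedTuple):
    """Hypothetical stand-in for the NCBI classification tuple."""

    organism: str
    genus: str
    species: str
    strain: str


def create_labels(uid_class: Classification, filestem: str,
                  datahash: str) -> Tuple[str, str]:
    """Return (label, class) entries for one assembly.

    Mirrors the tab-separated format built inline in get_ncbi_asm;
    the real download.create_labels may differ.
    """
    abbrev = f"{uid_class.genus[0]}. {uid_class.species} {uid_class.strain}"
    labeltxt = f"{datahash}\t{filestem}_genomic\t{abbrev}"
    classtxt = f"{datahash}\t{filestem}_genomic\t{uid_class.organism}"
    return labeltxt, classtxt

Example #3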
def get_ncbi_asm(args, asm_uid, fmt="fasta"):
    """Return the NCBI AssemblyAccession and AssemblyName for an assembly.

    :param args:  Namespace, command-line arguments
    :param asm_uid:  NCBI assembly UID
    :param fmt:  str, format to retrieve assembly information

    Also returns organism data for class/label files, as well as the
    accession, so we can track whether downloads fail because only the
    most recent version is available.

    AssemblyAccession and AssemblyName are data fields in the eSummary record,
    and correspond to downloadable files for each assembly at
    ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GC[AF]/nnn/nnn/nnn/<AA>_<AN>
    where <AA> is AssemblyAccession and <AN> is AssemblyName; the choice of
    GCA vs GCF and the three nnn values are taken from <AA>.
    """
    logger = logging.getLogger(__name__)

    logger.info("Identifying assembly information from NCBI for %s", asm_uid)

    # Obtain full eSummary data for the assembly
    summary = Entrez.read(
        entrez_retry(args, Entrez.esummary, db="assembly", id=asm_uid, report="full"),
        validate=False,
    )

    # Extract filestem from assembly data
    data = summary["DocumentSummarySet"]["DocumentSummary"][0]
    filestem = extract_filestem(data)

    # Report interesting things from the summary for those interested
    logger.info("\tOrganism: %s", data["Organism"])
    logger.info("\tTaxid: %s", data["SpeciesTaxid"])
    logger.info("\tAccession: %s", data["AssemblyAccession"])
    logger.info("\tName: %s", data["AssemblyName"])
    # NOTE: Maybe parse out the assembly stats here, in future?

    # Get class and label text
    organism = data["SpeciesName"]
    try:
        strain = data["Biosource"]["InfraspeciesList"][0]["Sub_value"]
    except (KeyError, IndexError):
        # we consider this an error/incompleteness in the NCBI metadata
        strain = ""

    # Download and extract genome assembly
    hash_md5 = None
    try:
        fastafname = retrieve_asm_contigs(args, filestem, fmt=fmt)
        hash_md5 = create_hash(fastafname)
    except NCBIDownloadException:
        # This is a little hacky. Sometimes, RefSeq assemblies are
        # suppressed (presumably because they are non-redundant),
        # but the GenBank assembly persists. In those cases, we
        # *assume* (because it may not be true) that the corresponding
        # genbank sequence shares the same accession number, except
        # that GCF is replaced by GCA
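        # e.g. filestem "GCF_000011605.1_ASM1160v1" would become
        # "GCA_000011605.1_ASM1160v1" (illustrative accession)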
        gbfilestem = re.sub("^GCF_", "GCA_", filestem)
        logger.warning("Could not download %s, trying %s", filestem, gbfilestem)
        try:
            fastafname = retrieve_asm_contigs(args, gbfilestem, fmt=fmt)
            hash_md5 = create_hash(fastafname)
        except NCBIDownloadException:
            fastafname = None

    # Create label and class strings
    genus, species = organism.split(" ", 1)
    lbltxt = f"{hash_md5}\t{filestem}_genomic\t{genus[0]}. {species} {strain}"
    clstxt = f"{hash_md5}\t{filestem}_genomic\t{organism}"
    logger.info("\tLabel: %s", lbltxt)
    logger.info("\tClass: %s", clstxt)

    return (fastafname, clstxt, lbltxt, data["AssemblyAccession"])
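
The docstring above describes how assembly files are laid out on the NCBI FTP site. A short illustration of building that directory URL from AssemblyAccession and AssemblyName (the helper name and the example accession are hypothetical):

def asm_ftp_url(accession: str, name: str) -> str:
    """Build the NCBI FTP directory URL for an assembly.

    Follows the layout in the get_ncbi_asm docstring:
    .../all/GC[AF]/nnn/nnn/nnn/<AA>_<AN>, with the nnn triplets
    taken from the digits of the accession.
    """
    prefix, digits = accession.split("_", 1)  # e.g. "GCF", "000011605.1"
    digits = digits.split(".")[0]             # drop the version suffix
    triplets = [digits[i:i + 3] for i in range(0, 9, 3)]
    return "/".join(["ftp://ftp.ncbi.nlm.nih.gov/genomes/all", prefix,
                     *triplets, f"{accession}_{name}"])


# e.g. asm_ftp_url("GCF_000011605.1", "ASM1160v1") ->
# "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/011/605/GCF_000011605.1_ASM1160v1"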
Example #4
def subcmd_download(args: Namespace, logger: Logger) -> int:
    """Download assembled genomes in subtree of passed NCBI taxon ID.

    :param args:  Namespace, command-line arguments
    :param logger:  logging object
    """
    # Create output directory, respecting force/noclobber
    if not args.dryrun:
        tools.make_outdir(args.outdir, args.force, args.noclobber, logger)
    else:
        logger.warning("Dry run only: will not overwrite or download")

    # Set Entrez email
    download.set_ncbi_email(args.email)
    logger.info(f"Setting Entrez email address: {args.email}")

    # Parse Entrez API key, if provided
    api_path = args.api_keypath.expanduser()
    if not api_path.is_file():
        logger.warning(
            f"API path {api_path} not a valid file. Not using API key.")
        api_key = None
    else:
        api_key = download.parse_api_key(api_path)
        logger.info(f"API key recovered from {api_path}")

    # Get list of taxon IDs to download
    taxon_ids = download.split_taxa(args.taxon)
    logger.info(f"Taxon IDs received: {taxon_ids}")

    # Get assembly UIDs for each taxon
    asm_dict = tools.make_asm_dict(taxon_ids, args.retries)
    for tid, uids in asm_dict.items():
        logger.info(
            f"Taxon ID summary\n\tQuery: {tid}\n\tasm count: {len(uids)}\n\tUIDs: {uids}"
        )

    # Compile outputs to write class and label files, and a list of
    # skipped downloads (and define a helper tuple for collating skipped
    # genome information)
    classes = []
    labels = []
    skippedlist = []
    Skipped = namedtuple("Skipped",
                         "taxon_id accession organism strain url dltype")

    # Download contigs and hashes for each assembly UID in the list
    # On completion of this loop, each assembly in the list will either be
    # downloaded or skipped (with skipped genome information preserved in
    # skippedlist), and class/label info will be collated, ready for writing
    # to file.
    # Summary information is reported to the logger for each eSummary that
    # can be recovered
    for tid, uids in asm_dict.items():
        logger.info(f"Downloading contigs for Taxon ID {tid}")
        for uid in uids:
            # Obtain eSummary
            logger.info(f"Get eSummary information for UID {uid}")
            esummary, filestem = download.get_ncbi_esummary(
                uid, args.retries, api_key)
            uid_class = download.get_ncbi_classification(esummary)

            # Report summary
            outstr = "\n\t".join([
                f"Species Taxid: {esummary['SpeciesTaxid']}",
                f"TaxID: {esummary['Taxid']}",
                f"Accession: {esummary['AssemblyAccession']}",
                f"Name: {esummary['AssemblyName']}",
                f"Organism: {uid_class.organism}",
                f"Genus: {uid_class.genus}",
                f"Species: {uid_class.species}",
                f"Strain: {uid_class.strain}",
            ])
            logger.info(f"eSummary information:\n\t{outstr}")
            if args.dryrun:
                logger.warning(
                    f"(dry-run) skipping download of {esummary['AssemblyAccession']}"
                )
                continue

            # Obtain URLs, trying the RefSeq filestem first, then GenBank if
            # there's a failure
            dlfiledata = tools.DLFileData(
                filestem, "ftp://ftp.ncbi.nlm.nih.gov/genomes/all",
                "genomic.fna.gz")
            logger.info(f"Retrieving URLs for {filestem}")
            # Try RefSeq first
            dlstatus = tools.download_genome_and_hash(
                args,
                logger,
                dlfiledata,
                dltype="RefSeq",
                disable_tqdm=args.disable_tqdm,
            )
            # RefSeq failed, try GenBank
            # Pylint is confused by the content of dlstatus (a namedlist)
            if dlstatus.skipped:  # pylint: disable=no-member
                skippedlist.append(
                    Skipped(
                        tid,
                        uid,
                        uid_class.organism,
                        uid_class.strain,
                        dlstatus.url,  # pylint: disable=no-member
                        "RefSeq",
                    ))
                logger.warning(
                    "RefSeq failed. Trying GenBank alternative assembly")
                # Try GenBank assembly
                dlstatus = tools.download_genome_and_hash(
                    args,
                    logger,
                    dlfiledata,
                    dltype="GenBank",
                    disable_tqdm=args.disable_tqdm,
                )
                # Pylint is confused by the content of dlstatus (a namedlist)
                if dlstatus.skipped:  # pylint: disable=no-member
                    skippedlist.append(
                        Skipped(
                            tid,
                            uid,
                            uid_class.organism,
                            uid_class.strain,
                            dlstatus.url,
                            "GenBank",
                        ))
                    logger.warning("GenBank failed.")
                    continue  # Move straight on to the next download

            # One of the downloads worked: report information
            logger.info(f"Downloaded from URL: {dlstatus.url}")
            logger.info(f"Wrote assembly to: {dlstatus.outfname}")
            logger.info(f"Wrote MD5 hashes to: {dlstatus.outfhash}")

            # Check hash for the download
            hashstatus = download.check_hash(dlstatus.outfname,
                                             dlstatus.outfhash)
            logger.info(f"Local MD5 hash: {hashstatus.localhash}")
            logger.info(f"NCBI MD5 hash: {hashstatus.filehash}")
            if hashstatus.passed:
                logger.info("MD5 hash check passed")
            else:
                logger.warning(
                    "MD5 hash check failed. Please check and retry.")

            # Extract downloaded files
            ename = dlstatus.outfname.with_suffix("")  # should strip only last suffix
            if ename.exists() and args.noclobber:
                logger.warning(f"Output file {ename} exists, not extracting")
            else:
                logger.info(
                    f"Extracting archive {dlstatus.outfname} to {ename}")
                download.extract_contigs(dlstatus.outfname, ename)

            # Modify sequence ID header if Kraken option active
            if args.kraken:
                logger.warning(
                    "Modifying downloaded sequence for Kraken compatibility")
                seqdata = list(SeqIO.parse(ename, "fasta"))
                logger.info(f"Modifying {ename}")
                for seq in seqdata:
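                    # e.g. "NC_000913.3" -> "NC_000913.3|kraken:taxid|511145"
                    # (illustrative sequence ID and taxid)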
                    seq.id = "|".join(
                        [seq.id, "kraken:taxid", esummary["SpeciesTaxid"]])
                SeqIO.write(seqdata, ename, "fasta")

            # Create MD5 hash for the downloaded contigs
            logger.info(f"Creating local MD5 hash for {ename}")
            hashfname = ename.with_suffix(".md5")
            datahash = download.create_hash(ename)
            logger.info("Writing hash to %s" % hashfname)
            with open(hashfname, "w") as hfh:
                hfh.write("\t".join([datahash, str(ename)]) + "\n")
            # Make label/class text
            labeltxt, classtxt = download.create_labels(
                uid_class, filestem, datahash)
            classes.append(classtxt)
            labels.append(labeltxt)
            logger.info(
                f"Label and class file entries\n\tLabel: {labeltxt}\n\tClass: {classtxt}"
            )

    # Write class and label files
    classfname = args.outdir / args.classfname
    logger.info(f"Writing classes file to {classfname}")
    if classfname.exists() and args.noclobber:
        logger.warning(f"Class file {classfname} exists, not overwriting")
    else:
        with open(classfname, "w") as ofh:
            ofh.write("\n".join(classes) + "\n")

    labelfname = args.outdir / args.labelfname
    logger.info(f"Writing labels file to {labelfname}")
    if labelfname.exists() and args.noclobber:
        logger.warning(f"Labels file {labelfname} exists, not overwriting")
    else:
        with open(labelfname, "w") as ofh:
            ofh.write("\n".join(labels) + "\n")

    # Report skipped genome list
    if skippedlist:
        logger.warning(f"{len(skippedlist)} genome downloads were skipped")
        for skipped in skippedlist:
            outstr = "\n\t".join([
                f"taxon id: {skipped.taxon_id}",
                f"accession: {skipped.accession}",
                f"URL: {skipped.url}",
                f"source: {skipped.dltype}",
            ])
            logger.warning(f"{skipped.organism} {skipped.strain}:\n\t{outstr}")

    return 0
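
subcmd_download verifies each download with download.check_hash, comparing a locally computed MD5 against the hash NCBI ships alongside the assembly. A self-contained sketch of that check, where the namedtuple fields match the attributes used above but the hash-file parsing is assumed:

import hashlib
from collections import namedtuple
from pathlib import Path

Hashstatus = namedtuple("Hashstatus", "passed localhash filehash")


def check_hash(fname: Path, hashfile: Path) -> Hashstatus:
    """Compare the local MD5 of fname with the entry in hashfile.

    Sketch only: the real download.check_hash may parse NCBI's
    hash file differently.
    """
    md5 = hashlib.md5()
    with open(fname, "rb") as fh:
        for chunk in iter(lambda: fh.read(65536), b""):
            md5.update(chunk)
    localhash = md5.hexdigest()

    filehash = ""
    with open(hashfile) as ifh:
        for line in ifh:
            hashval, filename = line.strip().split(None, 1)
            if Path(filename).name == fname.name:  # match on basename
                filehash = hashval
                break
    return Hashstatus(localhash == filehash, localhash, filehash)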