Ejemplo n.º 1
0
    def press_hmms(self) -> None:
        """Run hmmpress on every marker HMM that does not have index files yet.

        An ``*.hmm`` file in ``self.markers_dir`` is treated as already pressed
        when at least one ``<hmm>.h3?`` index file exists next to it. Every
        remaining HMM is handed to ``hmmscan.hmmpress`` and a ``.md5`` checksum
        file is then written beside each index file found for it.

        Returns
        -------
        NoneType

        """
        # First search for pressed hmms to remove from list to hmmpress.
        # Stripping the `.h3?` extension from an index path yields its hmm path.
        index_pattern = os.path.join(self.markers_dir, "*.h3?")
        pressed_hmms = {
            os.path.realpath(os.path.splitext(index_fp)[0])
            for index_fp in glob(index_pattern)
        }
        # Now retrieve all hmms in markers directory
        hmms = (
            os.path.join(self.markers_dir, fn)
            for fn in os.listdir(self.markers_dir)
            if fn.endswith(".hmm")
        )
        # Filter by hmms not already pressed. Both sides of the comparison must
        # be canonical paths: `pressed_hmms` holds realpaths, so a relative or
        # symlinked markers directory would otherwise never match and already
        # pressed hmms would be pressed again.
        unpressed_hmms = [
            hmm_fp for hmm_fp in hmms if os.path.realpath(hmm_fp) not in pressed_hmms
        ]
        # Press hmms and write checksums of their indices
        for hmm_fp in unpressed_hmms:
            hmmscan.hmmpress(hmm_fp)
            for index_fp in glob(f"{hmm_fp}.h3?"):
                write_checksum(index_fp, f"{index_fp}.md5")
Ejemplo n.º 2
0
    def download_markers(self, options: Iterable) -> None:
        """Download markers database files and amend user config to reflect this.

        For each option the url is read from the `database_urls` config
        section. The output path is the one already configured under
        `markers`, or else the url's basename inside `self.markers_dir`.
        After all options are processed `self.press_hmms()` is called.

        Parameters
        ----------
        options : iterable
            iterable containing options in 'markers' section to download.

        Returns
        -------
        NoneType
            Will update provided `options` in `self.config`.

        Raises
        -------
        ConnectionError
            marker file download failed.
        ChecksumMismatchError
            checksum of the downloaded file differs from the remote checksum.

        """
        for option in options:
            # First retrieve the markers file url from `option` in `markers`
            url = self.config.get("database_urls", option)
            if self.config.has_option("markers", option):
                outfpath = self.config.get("markers", option)
            else:
                outfname = os.path.basename(url)
                outfpath = os.path.join(self.markers_dir, outfname)

            if self.dryrun:
                # A dry run only records where the file would be written.
                logger.debug(f"UPDATE: (markers,{option}): {outfpath}")
                self.config.set("markers", option, outfpath)
                continue

            # Retrieve the markers file *before* touching `outfpath`, so a failed
            # request can neither truncate an existing file nor leave an empty one.
            with requests.Session() as session:
                resp = session.get(url)
                if not resp.ok:
                    raise ConnectionError(f"Failed to retrieve {url}")
                payload = resp.content
            # Write the raw bytes: the file is compared against the remote
            # checksum below, so it must not pass through text decoding or
            # newline translation.
            with open(outfpath, "wb") as fh:
                fh.write(payload)
            self.config.set("markers", option, outfpath)
            checksum_outfpath = f"{outfpath}.md5"
            write_checksum(outfpath, checksum_outfpath)
            current_checksum = read_checksum(checksum_outfpath)
            # Checksum lines are split into two fields; only the hash is compared.
            current_hash, __ = current_checksum.split()
            remote_checksum = self.get_remote_checksum("markers", option)
            remote_hash, __ = remote_checksum.split()
            if current_hash != remote_hash:
                raise ChecksumMismatchError(f"{option} download failed")
        self.press_hmms()
Ejemplo n.º 3
0
    def extract_taxdump(self) -> None:
        """Extract nodes.dmp, names.dmp and merged.dmp from the taxdump archive.

        The archive path is read from option `taxdump` in config section
        `ncbi` and the files are placed in `self.ncbi_dir`. A file is only
        extracted when it is not already present; when `self.update` is set,
        an existing file is removed first so that it is extracted again.
        A `<file>.md5` checksum is written next to each file.

        During a dry run nothing is removed, extracted or checksummed; only
        the expected paths are logged and stored in the config.

        Returns
        -------
        NoneType
            Will update `self.config` section `ncbi` with options 'nodes',
            'names','merged'

        """
        archive_fpath = self.config.get("ncbi", "taxdump")
        members = (
            ("nodes", "nodes.dmp"),
            ("names", "names.dmp"),
            ("merged", "merged.dmp"),
        )
        for key, member in members:
            dmp_fpath = os.path.join(self.ncbi_dir, member)
            if not self.dryrun:
                # An update request discards the previous copy, forcing re-extraction.
                if self.update and os.path.exists(dmp_fpath):
                    os.remove(dmp_fpath)
                # Extract only what is missing; untar returns the path it wrote.
                if not os.path.exists(dmp_fpath):
                    dmp_fpath = untar(archive_fpath, self.ncbi_dir, member)
                write_checksum(dmp_fpath, f"{dmp_fpath}.md5")

            # Dry runs and real runs record the location in exactly the same way.
            logger.debug(f"UPDATE (ncbi,{key}): {dmp_fpath}")
            self.config.set("ncbi", key, dmp_fpath)
Ejemplo n.º 4
0
    def download_ncbi_files(self, options: Iterable) -> None:
        """Download NCBI database files.

        Each option's url is read from the `database_urls` config section,
        its scheme is switched to rsync and the file is transferred with the
        `rsync` command. The local file's checksum is then compared against
        the remote checksum. Afterwards the taxdump archive is extracted
        and/or the nr database is formatted when those options were requested.

        During a dry run no file is transferred or formatted; the output path
        of every requested option is still logged and stored in the config.

        Parameters
        ----------
        options : iterable
            iterable containing options in 'ncbi' section to download.

        Returns
        -------
        NoneType
            Will update provided `options` in `self.config`.

        Raises
        -------
        subprocess.CalledProcessError
            NCBI file download with rsync failed.
        ChecksumMismatchError
            NCBI file checksums do not match after file transfer.

        """
        # s.t. set methods are available
        options = set(options)
        # nodes, names and merged are not downloaded individually; they are
        # extracted from the taxdump tarball (see self.extract_taxdump), so
        # requesting any of them means the tarball itself is needed.
        for taxdump_option in {"nodes", "names", "merged"}:
            if taxdump_option in options:
                options.add("taxdump")
                options.discard(taxdump_option)
        for option in options:
            ftp_fullpath = self.config.get("database_urls", option)

            if (
                self.config.has_option("ncbi", option)
                and self.config.get("ncbi", option) is not None
            ):
                outfpath = self.config.get("ncbi", option)
            else:
                outfname = os.path.basename(ftp_fullpath)
                outfpath = os.path.join(self.ncbi_dir, outfname)

            logger.debug(f"UPDATE: (ncbi,{option}): {outfpath}")
            self.config.set("ncbi", option, outfpath)

            if self.dryrun:
                # Move on to the next option instead of returning: `options` is
                # an unordered set, so returning here would record only one
                # arbitrary option and skip all the others.
                continue

            rsync_fpath = ftp_fullpath.replace("ftp", "rsync", 1)
            cmd = ["rsync", "--quiet", "--archive", rsync_fpath, outfpath]
            logger.debug(f"starting {option} download")
            subprocess.run(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )
            checksum_outfpath = f"{outfpath}.md5"
            write_checksum(outfpath, checksum_outfpath)
            current_checksum = read_checksum(checksum_outfpath)
            # Checksum lines are split into two fields; only the hash is compared.
            current_hash, __ = current_checksum.split()
            remote_checksum = self.get_remote_checksum("ncbi", option)
            remote_hash, __ = remote_checksum.split()
            if current_hash != remote_hash:
                raise ChecksumMismatchError(f"{option} download failed")
        if "taxdump" in options:
            # extract_taxdump checks self.dryrun itself and then only logs the
            # paths and updates the config.
            self.extract_taxdump()
        # format_nr has no dry-run handling, so it must not run during a dry run.
        if "nr" in options and not self.dryrun:
            self.format_nr()
Ejemplo n.º 5
0
    def format_nr(self) -> None:
        """Construct a diamond formatted database (nr.dmnd) from `nr` option
        in `ncbi` section in user config.

        The output path is the input path with '.gz' in its file name
        replaced by '.dmnd'. Three checksum comparisons are made (only when
        the output database and the relevant checksum files exist). If any
        of them matches, formatting is skipped and the config is pointed at
        the existing database. If none matches and `self.update` is false,
        the method returns without formatting and without changing the
        config.

        NOTE: The checksum 'nr.dmnd.md5' will only be generated if nr.dmnd
        construction is successful. If the provided `nr` option in `ncbi` is
        'nr.gz' the database will be removed after successful database
        formatting.

        Returns
        -------
        NoneType
            config updated option:'nr' in section:'ncbi'.

        """
        db_infpath = self.config.get("ncbi", "nr")
        db_infpath_md5 = f"{db_infpath}.md5"
        # Swap the extension in the file name only. Running str.replace over
        # the whole path would also rewrite any directory containing '.gz'.
        db_dirpath, db_infname = os.path.split(db_infpath)
        db_outfpath = os.path.join(db_dirpath, db_infname.replace(".gz", ".dmnd"))

        db_outfpath_exists = os.path.exists(db_outfpath)
        if db_outfpath_exists:
            # Only bound when the database exists; every later use is guarded
            # by `db_outfpath_exists`.
            db_outfpath_hash, __ = calc_checksum(db_outfpath).split()

        remote_checksum_matches = False
        current_nr_checksum_matches = False
        # Check database and database checksum is up-to-date
        if os.path.exists(db_infpath_md5) and db_outfpath_exists:
            # Check if the current db md5 is up-to-date with the remote db md5
            current_hash, __ = read_checksum(db_infpath_md5).split()
            remote_hash, __ = self.get_remote_checksum("ncbi", "nr").split()
            if remote_hash == current_hash:
                remote_checksum_matches = True
            # Check if the current db md5 matches the calc'd db checksum
            # NOTE(review): this compares the hash of the output database with
            # the hash recorded for the input file -- confirm this is intended.
            if db_outfpath_hash == current_hash:
                current_nr_checksum_matches = True

        db_outfpath_md5 = f"{db_outfpath}.md5"
        db_outfpath_md5_checksum_matches = False
        if os.path.exists(db_outfpath_md5) and db_outfpath_exists:
            db_outfpath_md5_hash, __ = read_checksum(db_outfpath_md5).split()
            if db_outfpath_hash == db_outfpath_md5_hash:
                db_outfpath_md5_checksum_matches = True

        # Labels and results are kept in the same order; the labels are used
        # for logging and to recognise the final check in the loop below.
        checksum_checks = ["nr.dmnd.md5", "nr.gz.md5", "remote nr.gz.md5"]
        checksum_matches = [
            db_outfpath_md5_checksum_matches,
            current_nr_checksum_matches,
            remote_checksum_matches,
        ]
        for checksum_match, checksum_check in zip(checksum_matches, checksum_checks):
            # The first matching checksum means the existing database is kept.
            if checksum_match:
                logger.debug(f"{checksum_check} checksum matches, skipping...")
                self.config.set("ncbi", "nr", db_outfpath)
                logger.debug(f"set ncbi nr: {db_outfpath}")
                return
            # Only update out-of-date db files if user wants to update via self.update
            # (reached on the last check, i.e. when nothing matched).
            # NOTE(review): this also returns when the output database does not
            # exist yet -- confirm that is the desired behaviour.
            if not self.update and checksum_check == "remote nr.gz.md5":
                return

        diamond.makedatabase(fasta=db_infpath, database=db_outfpath, nproc=self.nproc)
        # Write checksum for nr.dmnd
        write_checksum(db_outfpath, db_outfpath_md5)

        if os.path.basename(db_infpath) == "nr.gz":
            # nr.gz will be removed after successful nr.dmnd construction
            os.remove(db_infpath)

        self.config.set("ncbi", "nr", db_outfpath)
        logger.debug(f"set ncbi nr: {db_outfpath}")