def press_hmms(self) -> None:
    """Run ``hmmpress`` on marker hmm files that are not yet pressed.

    Existing pressed indices (``*.h3f/.h3i/.h3m/.h3p``) in the markers
    directory are detected first so their hmms are skipped. A checksum
    file is written next to every index generated by ``hmmpress``.

    Returns
    -------
    NoneType
    """
    index_pattern = os.path.join(self.markers_dir, "*.h3?")
    # Base paths of hmms whose pressed index files already exist.
    already_pressed = set()
    for index_fp in glob(index_pattern):
        if index_fp.endswith(".md5"):
            continue
        already_pressed.add(os.path.realpath(os.path.splitext(index_fp)[0]))
    # Press every un-pressed hmm, then checksum its generated indices.
    for fname in os.listdir(self.markers_dir):
        if not fname.endswith(".hmm"):
            continue
        hmm_fp = os.path.join(self.markers_dir, fname)
        if hmm_fp in already_pressed:
            continue
        hmmscan.hmmpress(hmm_fp)
        for index_fp in glob(f"{hmm_fp}.h3?"):
            write_checksum(index_fp, f"{index_fp}.md5")
def download_markers(self, options: Iterable) -> None:
    """Download markers database files and amend user config to reflect this.

    Parameters
    ----------
    options : iterable
        iterable containing options in 'markers' section to download.

    Returns
    -------
    NoneType
        Will update provided `options` in `self.config`.

    Raises
    -------
    ConnectionError
        marker file download failed.
    ChecksumMismatchError
        Downloaded file checksum does not match the remote checksum.
    """
    for option in options:
        # First retrieve the markers file url from `option` in `markers`
        url = self.config.get("database_urls", option)
        if self.config.has_option("markers", option):
            outfpath = self.config.get("markers", option)
        else:
            outfname = os.path.basename(url)
            outfpath = os.path.join(self.markers_dir, outfname)
        if self.dryrun:
            logger.debug(f"UPDATE: (markers,{option}): {outfpath}")
            self.config.set("markers", option, outfpath)
            continue
        # BUGFIX: validate the response *before* opening `outfpath`.
        # Previously the file was opened in "w" mode first, so a failed
        # request truncated any existing markers file and left it empty.
        with requests.Session() as session:
            resp = session.get(url)
        if not resp.ok:
            raise ConnectionError(f"Failed to retrieve {url}")
        with open(outfpath, "w") as fh:
            fh.write(resp.text)
        self.config.set("markers", option, outfpath)
        # Checksum the downloaded file and compare against the remote md5.
        checksum_outfpath = f"{outfpath}.md5"
        write_checksum(outfpath, checksum_outfpath)
        current_checksum = read_checksum(checksum_outfpath)
        current_hash, __ = current_checksum.split()
        remote_checksum = self.get_remote_checksum("markers", option)
        remote_hash, __ = remote_checksum.split()
        if current_hash != remote_hash:
            raise ChecksumMismatchError(f"{option} download failed")
    # Press any newly downloaded (un-pressed) hmm files.
    self.press_hmms()
def extract_taxdump(self) -> None:
    """Extract autometa required files from ncbi taxdump.tar.gz archive
    into ncbi databases directory and update user config with extracted paths.

    Only nodes.dmp, names.dmp and merged.dmp are pulled from taxdump.tar.gz,
    and only when the files do not already exist. If `update` was originally
    supplied as `True` to the Databases instance, any previous copies are
    replaced by freshly extracted files. After extraction a checksum is
    written for each file for future checking.

    Returns
    -------
    NoneType
        Will update `self.config` section `ncbi` with options
        'nodes', 'names', 'merged'
    """
    archive_fpath = self.config.get("ncbi", "taxdump")
    extraction_targets = (
        ("nodes", "nodes.dmp"),
        ("names", "names.dmp"),
        ("merged", "merged.dmp"),
    )
    for option, fname in extraction_targets:
        outfpath = os.path.join(self.ncbi_dir, fname)
        # On a dry run only record the would-be path in the config.
        if self.dryrun:
            logger.debug(f"UPDATE (ncbi,{option}): {outfpath}")
            self.config.set("ncbi", option, outfpath)
            continue
        # An explicit update replaces any previously extracted file.
        if self.update and os.path.exists(outfpath):
            os.remove(outfpath)
        # Extract only when the target file is missing.
        if not os.path.exists(outfpath):
            outfpath = untar(archive_fpath, self.ncbi_dir, fname)
        write_checksum(outfpath, f"{outfpath}.md5")
        logger.debug(f"UPDATE (ncbi,{option}): {outfpath}")
        self.config.set("ncbi", option, outfpath)
def download_ncbi_files(self, options: Iterable) -> None:
    """Download NCBI database files.

    Parameters
    ----------
    options : iterable
        iterable containing options in 'ncbi' section to download.

    Returns
    -------
    NoneType
        Will update provided `options` in `self.config`.

    Raises
    -------
    subprocess.CalledProcessError
        NCBI file download with rsync failed.
    ChecksumMismatchError
        NCBI file checksums do not match after file transfer.
    """
    # s.t. set methods are available
    options = set(options)
    # nodes.dmp, names.dmp and merged.dmp all come from the taxdump tarball,
    # so swap any of those options for 'taxdump' (see self.extract_taxdump).
    for taxdump_option in {"nodes", "names", "merged"}:
        if taxdump_option in options:
            options.add("taxdump")
            options.discard(taxdump_option)
    for option in options:
        ftp_fullpath = self.config.get("database_urls", option)
        if (self.config.has_option("ncbi", option)
                and self.config.get("ncbi", option) is not None):
            outfpath = self.config.get("ncbi", option)
        else:
            outfname = os.path.basename(ftp_fullpath)
            outfpath = os.path.join(self.ncbi_dir, outfname)
        logger.debug(f"UPDATE: (ncbi,{option}): {outfpath}")
        self.config.set("ncbi", option, outfpath)
        if self.dryrun:
            # BUGFIX: was `return`, which exited after the first option so the
            # remaining options were never recorded in the config on a dry run.
            # `continue` matches download_markers' dry-run behavior.
            continue
        # NCBI serves the same path over rsync; swap only the url scheme.
        rsync_fpath = ftp_fullpath.replace("ftp", "rsync", 1)
        cmd = ["rsync", "--quiet", "--archive", rsync_fpath, outfpath]
        logger.debug(f"starting {option} download")
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        # Checksum the transferred file and compare against the remote md5.
        checksum_outfpath = f"{outfpath}.md5"
        write_checksum(outfpath, checksum_outfpath)
        current_checksum = read_checksum(checksum_outfpath)
        current_hash, __ = current_checksum.split()
        remote_checksum = self.get_remote_checksum("ncbi", option)
        remote_hash, __ = remote_checksum.split()
        if current_hash != remote_hash:
            raise ChecksumMismatchError(f"{option} download failed")
    if "taxdump" in options:
        # extract_taxdump is dryrun-aware: on a dry run it only updates config.
        self.extract_taxdump()
    if "nr" in options and not self.dryrun:
        # format_nr performs real work (diamond makedb), so skip it on a dry run.
        self.format_nr()
def format_nr(self) -> None:
    """Construct a diamond formatted database (nr.dmnd) from `nr` option
    in `ncbi` section in user config.

    NOTE: The checksum 'nr.dmnd.md5' will only be generated if nr.dmnd
    construction is successful. If the provided `nr` option in `ncbi` is
    'nr.gz' the database will be removed after successful database formatting.

    Returns
    -------
    NoneType
        config updated option:'nr' in section:'ncbi'.
    """
    db_infpath = self.config.get("ncbi", "nr")
    db_infpath_md5 = f"{db_infpath}.md5"
    # e.g. /path/nr.gz -> /path/nr.dmnd
    db_outfpath = db_infpath.replace(".gz", ".dmnd")
    db_outfpath_exists = os.path.exists(db_outfpath)
    if db_outfpath_exists:
        db_outfpath_hash, __ = calc_checksum(db_outfpath).split()
    remote_checksum_matches = False
    current_nr_checksum_matches = False
    # Check database and database checksum is up-to-date
    if os.path.exists(db_infpath_md5) and db_outfpath_exists:
        # Check if the current db md5 is up-to-date with the remote db md5
        current_hash, __ = read_checksum(db_infpath_md5).split()
        remote_hash, __ = self.get_remote_checksum("ncbi", "nr").split()
        if remote_hash == current_hash:
            remote_checksum_matches = True
        # Check if the current db md5 matches the calc'd db checksum
        if db_outfpath_hash == current_hash:
            current_nr_checksum_matches = True
    db_outfpath_md5 = f"{db_outfpath}.md5"
    db_outfpath_md5_checksum_matches = False
    if os.path.exists(db_outfpath_md5) and db_outfpath_exists:
        db_outfpath_md5_hash, __ = read_checksum(db_outfpath_md5).split()
        if db_outfpath_hash == db_outfpath_md5_hash:
            db_outfpath_md5_checksum_matches = True
    # Checks are ordered cheapest-first: an existing nr.dmnd.md5 match,
    # then the local nr.gz.md5 match, then the remote nr.gz.md5 match.
    checksum_checks = ["nr.dmnd.md5", "nr.gz.md5", "remote nr.gz.md5"]
    checksum_matches = [
        db_outfpath_md5_checksum_matches,
        current_nr_checksum_matches,
        remote_checksum_matches,
    ]
    for checksum_match, checksum_check in zip(checksum_matches, checksum_checks):
        # If the checksums do not match, we need to update the database file.
        if checksum_match:
            # Any single matching checksum means nr.dmnd is up-to-date:
            # record it in the config and stop.
            logger.debug(f"{checksum_check} checksum matches, skipping...")
            self.config.set("ncbi", "nr", db_outfpath)
            logger.debug(f"set ncbi nr: {db_outfpath}")
            return
        # Only update out-of-date db files if user wants to update via self.update
        if not self.update and checksum_check == "remote nr.gz.md5":
            return
    # No checksum matched (and updating is allowed): (re)build nr.dmnd.
    diamond.makedatabase(fasta=db_infpath, database=db_outfpath, nproc=self.nproc)
    # Write checksum for nr.dmnd
    write_checksum(db_outfpath, db_outfpath_md5)
    if os.path.basename(db_infpath) == "nr.gz":
        # nr.gz will be removed after successful nr.dmnd construction
        os.remove(db_infpath)
    self.config.set("ncbi", "nr", db_outfpath)
    logger.debug(f"set ncbi nr: {db_outfpath}")