def download_markers(self, options: Iterable) -> None:
    """Download marker database files and amend user config to reflect this.

    Parameters
    ----------
    options : iterable
        iterable containing options in 'markers' section to download.

    Returns
    -------
    NoneType
        Will update provided `options` in `self.config`.

    Raises
    ------
    ConnectionError
        Marker file download failed.
    ChecksumMismatchError
        Downloaded marker file's checksum does not match its remote checksum.

    """
    for option in options:
        # First retrieve the markers file url from `option` in `markers`
        url = self.config.get("database_urls", option)
        if self.config.has_option("markers", option):
            outfpath = self.config.get("markers", option)
        else:
            outfname = os.path.basename(url)
            outfpath = os.path.join(self.markers_dir, outfname)
        if self.dryrun:
            logger.debug(f"UPDATE: (markers,{option}): {outfpath}")
            self.config.set("markers", option, outfpath)
            continue
        # Retrieve markers file and write contents to `outfpath`
        with requests.Session() as session, open(outfpath, "w") as fh:
            resp = session.get(url)
            if not resp.ok:
                raise ConnectionError(f"Failed to retrieve {url}")
            fh.write(resp.text)
        self.config.set("markers", option, outfpath)
        # Write the local checksum, then compare it against the remote checksum
        checksum_outfpath = f"{outfpath}.md5"
        write_checksum(outfpath, checksum_outfpath)
        current_checksum = read_checksum(checksum_outfpath)
        current_hash, __ = current_checksum.split()
        remote_checksum = self.get_remote_checksum("markers", option)
        remote_hash, __ = remote_checksum.split()
        if current_hash != remote_hash:
            raise ChecksumMismatchError(f"{option} download failed")
    self.press_hmms()
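
# Hypothetical usage sketch (not part of the module): it assumes a configured
# `Databases` instance named `dbs` and that the option names below exist in the
# user config's [markers] and [database_urls] sections.
#
#     dbs.download_markers(options=["bacteria_single_copy", "archaea_single_copy"])
#
# Each file is checksummed against its remote .md5 after download; a mismatch raises
# ChecksumMismatchError, and `press_hmms()` runs once all requested files succeed.
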
def compare_checksums(self, section: str = None) -> Dict[str, Dict]:
    """Get all invalid database files in `options` from `section` in config.

    An MD5 checksum comparison is performed between each file's current and
    remote MD5 to ensure file integrity prior to marking the respective file
    as valid.

    Parameters
    ----------
    section : str, optional
        Only check options within the provided `section`.
        Choices include 'markers' and 'ncbi'.
        (default will check all database sections)

    Returns
    -------
    dict
        {section:{option, option,...}, section:{...}, ...}

    """
    sections = [section] if section else Databases.SECTIONS.keys()
    invalid = {}
    taxdump_checked = False
    for section in sections:
        for option in self.config.options(section):
            if option not in Databases.SECTIONS.get(section):
                # Skip user added options not required by Autometa
                continue
            # nodes.dmp, names.dmp and merged.dmp are all in taxdump.tar.gz
            option = "taxdump" if option in {"nodes", "names", "merged"} else option
            fpath = self.config.get(section, option)
            fpath_md5 = f"{fpath}.md5"
            # We can not checksum a file that does not exist.
            if not os.path.exists(fpath) and not os.path.exists(fpath_md5):
                continue
            # To not waste time checking the taxdump files 3 times.
            if option == "taxdump" and taxdump_checked:
                continue
            if os.path.exists(fpath_md5):
                current_checksum = read_checksum(fpath_md5)
            else:
                current_checksum = calc_checksum(fpath)
            current_hash, __ = current_checksum.split()
            try:
                remote_checksum = self.get_remote_checksum(section, option)
                remote_hash, __ = remote_checksum.split()
            except ConnectionError as err:
                # Do not mark file as invalid if a connection error occurs.
                logger.warning(err)
                continue
            if option == "taxdump":
                taxdump_checked = True
            if remote_hash == current_hash:
                logger.debug(f"{option} checksums match, skipping...")
                continue
            if section in invalid:
                invalid[section].add(option)
            else:
                invalid.update({section: set([option])})
    # Log invalid options
    for section, options in invalid.items():
        for option in options:
            logger.debug(f"INVALID: ({section},{option})")
    return invalid
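
# Hypothetical usage sketch (not part of the module), assuming a configured
# `Databases` instance named `dbs`:
#
#     invalid = dbs.compare_checksums(section="ncbi")
#     # e.g. {"ncbi": {"nr", "taxdump"}} -- options whose local and remote MD5s differ
#
# Omitting `section` checks every section listed in `Databases.SECTIONS`.
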
def download_ncbi_files(self, options: Iterable) -> None:
    """Download NCBI database files.

    Parameters
    ----------
    options : iterable
        iterable containing options in 'ncbi' section to download.

    Returns
    -------
    NoneType
        Will update provided `options` in `self.config`.

    Raises
    ------
    subprocess.CalledProcessError
        NCBI file download with rsync failed.
    ChecksumMismatchError
        NCBI file checksums do not match after file transfer.

    """
    # s.t. set methods are available
    options = set(options)
    # If any of the taxdump.tar.gz files are missing,
    # we need to check that taxdump tarball is available to extract them (see self.extract_taxdump).
    for taxdump_option in {"nodes", "names", "merged"}:
        if taxdump_option in options:
            options.add("taxdump")
            options.discard(taxdump_option)
    for option in options:
        ftp_fullpath = self.config.get("database_urls", option)
        if (
            self.config.has_option("ncbi", option)
            and self.config.get("ncbi", option) is not None
        ):
            outfpath = self.config.get("ncbi", option)
        else:
            outfname = os.path.basename(ftp_fullpath)
            outfpath = os.path.join(self.ncbi_dir, outfname)
        logger.debug(f"UPDATE: (ncbi,{option}): {outfpath}")
        self.config.set("ncbi", option, outfpath)
        if self.dryrun:
            # Only update the config during a dry run; skip the download itself.
            continue
        rsync_fpath = ftp_fullpath.replace("ftp", "rsync", 1)
        cmd = ["rsync", "--quiet", "--archive", rsync_fpath, outfpath]
        logger.debug(f"starting {option} download")
        subprocess.run(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
        )
        # Write the local checksum, then compare it against the remote checksum
        checksum_outfpath = f"{outfpath}.md5"
        write_checksum(outfpath, checksum_outfpath)
        current_checksum = read_checksum(checksum_outfpath)
        current_hash, __ = current_checksum.split()
        remote_checksum = self.get_remote_checksum("ncbi", option)
        remote_hash, __ = remote_checksum.split()
        if current_hash != remote_hash:
            raise ChecksumMismatchError(f"{option} download failed")
    if self.dryrun:
        # Nothing was downloaded, so there is nothing to extract or format.
        return
    if "taxdump" in options:
        self.extract_taxdump()
    if "nr" in options:
        self.format_nr()
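
# Hypothetical usage sketch (not part of the module), assuming a configured
# `Databases` instance named `dbs` and that rsync is available on PATH:
#
#     dbs.download_ncbi_files(options=["nodes", "names", "merged", "nr"])
#
# The three taxdump .dmp options collapse into a single "taxdump" download that is
# extracted afterwards; including "nr" also triggers diamond formatting via format_nr().
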
def format_nr(self) -> None:
    """Construct a diamond formatted database (nr.dmnd) from the `nr` option
    in the `ncbi` section of the user config.

    NOTE: The checksum 'nr.dmnd.md5' will only be generated if nr.dmnd
    construction is successful. If the provided `nr` option in `ncbi` is
    'nr.gz' the database will be removed after successful database formatting.

    Returns
    -------
    NoneType
        config updated option:'nr' in section:'ncbi'.

    """
    db_infpath = self.config.get("ncbi", "nr")
    db_infpath_md5 = f"{db_infpath}.md5"
    db_outfpath = db_infpath.replace(".gz", ".dmnd")

    db_outfpath_exists = os.path.exists(db_outfpath)
    if db_outfpath_exists:
        db_outfpath_hash, __ = calc_checksum(db_outfpath).split()

    remote_checksum_matches = False
    current_nr_checksum_matches = False
    # Check database and database checksum is up-to-date
    if os.path.exists(db_infpath_md5) and db_outfpath_exists:
        # Check if the current db md5 is up-to-date with the remote db md5
        current_hash, __ = read_checksum(db_infpath_md5).split()
        remote_hash, __ = self.get_remote_checksum("ncbi", "nr").split()
        if remote_hash == current_hash:
            remote_checksum_matches = True
        # Check if the current db md5 matches the calc'd db checksum
        if db_outfpath_hash == current_hash:
            current_nr_checksum_matches = True

    db_outfpath_md5 = f"{db_outfpath}.md5"
    db_outfpath_md5_checksum_matches = False
    if os.path.exists(db_outfpath_md5) and db_outfpath_exists:
        db_outfpath_md5_hash, __ = read_checksum(db_outfpath_md5).split()
        if db_outfpath_hash == db_outfpath_md5_hash:
            db_outfpath_md5_checksum_matches = True

    checksum_checks = ["nr.dmnd.md5", "nr.gz.md5", "remote nr.gz.md5"]
    checksum_matches = [
        db_outfpath_md5_checksum_matches,
        current_nr_checksum_matches,
        remote_checksum_matches,
    ]
    for checksum_match, checksum_check in zip(checksum_matches, checksum_checks):
        # If any checksum matches, the existing nr.dmnd is already up-to-date.
        if checksum_match:
            logger.debug(f"{checksum_check} checksum matches, skipping...")
            self.config.set("ncbi", "nr", db_outfpath)
            logger.debug(f"set ncbi nr: {db_outfpath}")
            return
        # Only update out-of-date db files if user wants to update via self.update
        if not self.update and checksum_check == "remote nr.gz.md5":
            return

    diamond.makedatabase(fasta=db_infpath, database=db_outfpath, nproc=self.nproc)
    # Write checksum for nr.dmnd
    write_checksum(db_outfpath, db_outfpath_md5)
    if os.path.basename(db_infpath) == "nr.gz":
        # nr.gz will be removed after successful nr.dmnd construction
        os.remove(db_infpath)
    self.config.set("ncbi", "nr", db_outfpath)
    logger.debug(f"set ncbi nr: {db_outfpath}")
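
# Hypothetical usage sketch (not part of the module), assuming the [ncbi] "nr" option
# in `dbs.config` points at a downloaded nr.gz and that diamond is installed:
#
#     dbs.format_nr()
#     dbs.config.get("ncbi", "nr")  # -> ".../nr.dmnd" after formatting succeeds or a
#                                   #    checksum check shows nr.dmnd is already current
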