def check_prereqs() -> List[str]:
    """ Check that the required executables and HMM profiles are available.

        For every profile that exists but lacks one of its '.h3?' binary
        files, hmmpress is run on that profile.

        Returns:
            a list of strings describing any failures, empty if there were none
    """
    problems = []

    # each entry is (executable name, whether it is optional)
    executables = [('hmmscan', False), ('hmmpress', False)]
    for name, is_optional in executables:
        located = path.locate_executable(name)
        if located is None and not is_optional:
            problems.append("Failed to locate executable for %r" % name)

    profile_names = ['abmotifs.hmm', 'dockingdomains.hmm', 'ksdomains.hmm', 'nrpspksdomains.hmm']
    profile_paths = [path.get_full_path(__file__, 'data', name) for name in profile_names]
    suffixes = ['.h3f', '.h3i', '.h3m', '.h3p']

    for profile in profile_paths:
        if path.locate_file(profile) is None:
            problems.append("Failed to locate file %r" % profile)
            continue
        for suffix in suffixes:
            if path.locate_file("{}{}".format(profile, suffix)) is not None:
                continue
            # hmmpress is run on the profile itself, not per extension, so
            # stop looking after the first missing binary file
            outcome = subprocessing.run_hmmpress(profile)
            if not outcome.successful():
                problems.append('Failed to hmmpress {!r}: {}'.format(profile, outcome.stderr))
            break

    return problems
def check_prereqs() -> List[str]:
    """ Checks if all required files and applications are around

        Profiles not ending in '.hmm2' would additionally have their hmmpress
        generated binary files created, or refreshed when they are not newer
        than the profile itself.

        Returns:
            a list of strings describing any failures, empty if there were none
    """
    _binary_extensions = ['.h3f', '.h3i', '.h3m', '.h3p']

    failure_messages = []
    for binary_name in ['hmmpfam2', 'hmmscan', 'hmmpress']:
        if not path.locate_executable(binary_name):
            failure_messages.append("Failed to locate file: %r" % binary_name)

    # the HMM profiles that are expected in the data directory
    for profile in ["PKSI-KR.hmm2", "PKSI-KS_N.hmm2", "PKSI-KS_C.hmm2", "PKSI-AT.hmm2",
                    "PKSI-ACP.hmm2", "PKSI-DH.hmm2", "Thioesterase.hmm2", "PKSI-ER.hmm2",
                    "aa-activating.aroundLys.hmm2", "p450.hmm2"]:
        full_hmm_path = path.get_full_path(__file__, "data", profile)

        if path.locate_file(full_hmm_path) is None:
            failure_messages.append("Failed to locate file: %s" % profile)
            continue

        # '.hmm2' profiles are never pressed; every profile listed above has
        # that suffix, so the remainder of this loop body only becomes active
        # if a profile with a different suffix is added
        if profile.endswith(".hmm2"):
            continue

        for ext in _binary_extensions:
            binary = "{hmm}{ext}".format(hmm=full_hmm_path, ext=ext)
            if not path.locate_file(binary):
                result = subprocessing.run_hmmpress(full_hmm_path)
                if not result.successful():
                    failure_messages.append("Failed to hmmpress {!r}: {!r}".format(profile, result.stderr))

                # hmmpress generates _all_ binary files in one go, so stop the loop
                break

            binary_mtime = os.path.getmtime(binary)
            hmm_mtime = os.path.getmtime(full_hmm_path)
            if hmm_mtime < binary_mtime:
                # generated file younger than hmm profile, do nothing
                continue

            # the binary is not newer than the profile: remove all generated
            # files for this profile and press it again
            try:
                for filename in glob.glob("{}.h3?".format(full_hmm_path)):
                    logging.debug("removing outdated file %r", filename)
                    os.remove(filename)
            except OSError as err:
                failure_messages.append("Failed to remove outdated binary file for %s: %s" %
                                        (profile, err))
                break

            result = subprocessing.run_hmmpress(full_hmm_path)
            if not result.successful():
                failure_messages.append("Failed to hmmpress %r: %r" % (profile, result.stderr))
                # the message is kept as a single-line literal; a string
                # literal broken across physical lines does not parse
                failure_messages.append(
                    "HMM binary files outdated. %s (changed: %s) vs %s (changed: %s)" % (
                        profile, datetime.datetime.fromtimestamp(hmm_mtime),
                        binary, datetime.datetime.fromtimestamp(binary_mtime)))
            # hmmpress generates _all_ binary files in one go, so stop the loop
            break

    return failure_messages
def check_diamond_files(definition_file: str, fasta_file: str,
                        db_file: str, logging_only: bool = False) -> List[str]:
    """ Verify the files backing a diamond database, rebuilding the database
        from the fasta file if it is missing, incompatible or outdated.

        Arguments:
            definition_file: the path to a database metadata file
            fasta_file: the path to a proteins fasta file
            db_file: the path to the diamond database file
            logging_only: return a list of error messages instead of raising errors

        Returns:
            a list of error strings

        Raises:
            FileNotFoundError: if the fasta file is missing and logging_only is False
            RuntimeError: if rebuilding the database fails and logging_only is False
    """
    errors: List[str] = []

    # a missing definition file is only ever reported, never raised
    if path.locate_file(definition_file) is None:
        errors.append("Failed to locate cluster definition file: {!r}".format(definition_file))

    rebuild_reason = ""
    if path.locate_file(fasta_file) is None:
        errors.append("Failed to locate cluster proteins: {!r}".format(fasta_file))
        if not logging_only:
            raise FileNotFoundError(errors[-1])
    else:
        # the first applicable reason wins, later checks are skipped
        if path.locate_file(db_file) is None:
            rebuild_reason = f"could not find diamond database: {db_file}"
        elif not check_diamond_db_compatible(db_file):
            rebuild_reason = f"incompatible diamond database version: {db_file}"
        elif path.is_outdated(db_file, fasta_file):
            rebuild_reason = f"diamond database outdated: {db_file}"

    if rebuild_reason:
        try:
            logging.debug("%s, regenerating", rebuild_reason)
            run_diamond_makedb(db_file, fasta_file)
        except RuntimeError:
            if not logging_only:
                raise
            errors.append("Failed to regenerate diamond database %r" % db_file)

    if not errors:
        return errors

    # any failure report ends with the executable that was configured
    errors.append(f"with diamond executable: {get_config().executables.diamond}")
    return errors
def check_clusterblast_files(definition_file: str, fasta_file: str,
                             db_file: str, logging_only: bool = False) -> List[str]:
    """ Verify the clusterblast data files, rebuilding the diamond database
        from the fasta file if it is missing, incompatible or outdated.

        Arguments:
            definition_file: the path to the cluster definition TSV file
            fasta_file: the path to the cluster proteins fasta file
            db_file: the path to the diamond database file
            logging_only: if True, a failed rebuild is reported in the returned
                          list instead of the RuntimeError being re-raised

        Returns:
            A list of error strings the way `check_prereqs` does
    """
    errors: List[str] = []

    if path.locate_file(definition_file) is None:
        errors.append("Failed to locate cluster definition file: {!r}".format(definition_file))

    rebuild_reason = ""
    if path.locate_file(fasta_file) is None:
        # without the fasta file a rebuild cannot be attempted at all
        errors.append("Failed to locate cluster proteins: {!r}".format(fasta_file))
    else:
        # the first applicable reason wins, later checks are skipped
        if path.locate_file(db_file) is None:
            rebuild_reason = "could not find diamond database: %s" % db_file
        elif not check_diamond_db_compatible(db_file):
            rebuild_reason = "incompatible diamond database version: %s" % db_file
        elif path.is_outdated(db_file, fasta_file):
            rebuild_reason = "diamond database outdated: %s" % db_file

    if rebuild_reason:
        try:
            logging.debug("%s, regenerating", rebuild_reason)
            subprocessing.run_diamond_makedb(db_file, fasta_file)
        except RuntimeError:
            if not logging_only:
                raise
            errors.append("Failed to regenerate diamond database %r" % db_file)

    if not errors:
        return errors

    # any failure report ends with the executable that was configured
    errors.append("with diamond executable: %s" % get_config().executables.diamond)
    return errors
def check_sub_prereqs(_options: ConfigType) -> List[str]:
    """ Report any missing executables or data files needed for the
        subcluster comparison.

        Arguments:
            _options: unused

        Returns:
            a list of error messages, one for each missing executable or file
    """
    # every entry is (name, whether it is optional)
    binaries = [
        ('blastp', False),
        ('makeblastdb', False),
    ]
    data_files = [
        ('subclusterprots.fasta', False),
        ('subclusterprots.fasta.phr', False),
        ('subclusterprots.fasta.pin', False),
        ('subclusterprots.fasta.psq', False),
        ('subclusters.txt', False),
    ]

    problems = [
        "Failed to locate file: %r" % name
        for name, is_optional in binaries
        if path.locate_executable(name) is None and not is_optional
    ]
    problems.extend(
        "Failed to locate file: %r" % name
        for name, is_optional in data_files
        if path.locate_file(_get_datafile_path(name)) is None and not is_optional
    )
    return problems
def check_prereqs() -> List[str]:
    """ Check that the executables and database files used for the cluster
        comparisons exist, including the known-cluster and subcluster checks.

        Returns:
            a list of error messages, one for each failed check
    """
    options = get_config()

    # every entry is (name, whether it is optional)
    binaries = [
        ('blastp', False),
        ('makeblastdb', False),
        ('diamond', False),
    ]
    database_files = [
        ('geneclusterprots.dmnd', False),
        ('geneclusterprots.fasta', False),
        ('geneclusters.txt', False),
    ]
    database_dir = os.path.join(options.database_dir, "clusterblast")

    problems = [
        "Failed to locate file: %r" % name
        for name, is_optional in binaries
        if path.locate_executable(name) is None and not is_optional
    ]
    for name, is_optional in database_files:
        located = path.locate_file(os.path.join(database_dir, name))
        if located is None and not is_optional:
            problems.append("Failed to locate file: %r" % name)

    # the variant-specific checks report into the same list
    for check in (check_known_prereqs, check_sub_prereqs):
        problems.extend(check(options))
    return problems
def check_prereqs() -> List[str]:
    """ Check that prereqs are satisfied.

        Both hmmsearch and hmmpress are treated as required. If the combined
        'bgc_seeds.hmm' file is missing it is generated from the individual
        signature profiles, and hmmpress is run if any of its binary files
        are missing.

        Returns:
            a list of strings describing any failures, empty if there were none
    """
    failure_messages = []
    for binary_name, optional in [('hmmsearch', False), ('hmmpress', False)]:
        if path.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate executable for %r" % binary_name)

    # Check that hmmdetails.txt is readable and well-formatted
    try:
        profiles = get_signature_profiles()
    except ValueError as err:
        failure_messages.append(str(err))
        # without the signature profiles there is nothing to build the seeds
        # file from, so report what has been found so far and stop
        return failure_messages

    # the path to the markov model
    hmm = path.get_full_path(__file__, 'data', 'bgc_seeds.hmm')
    hmm_files = [os.path.join("data", sig.hmm_file) for sig in profiles]
    if path.locate_file(hmm) is None:
        # try to generate file from all specified profiles in hmmdetails
        try:
            with open(hmm, 'w') as all_hmms_handle:
                for hmm_file in hmm_files:
                    with open(path.get_full_path(__file__, hmm_file), 'r') as handle:
                        all_hmms_handle.write(handle.read())
        except OSError:
            failure_messages.append('Failed to generate file {!r}'.format(hmm))

    # if previous steps have failed, the remainder will too, so don't try
    if failure_messages:
        return failure_messages

    binary_extensions = ['.h3f', '.h3i', '.h3m', '.h3p']
    for ext in binary_extensions:
        binary = "{}{}".format(hmm, ext)
        if path.locate_file(binary) is None:
            # hmmpress is run on the whole seeds file, so one attempt is enough
            result = run_hmmpress(hmm)
            if not result.successful():
                failure_messages.append('Failed to hmmpress {!r}: {}'.format(
                    hmm, result.stderr))
            break

    return failure_messages
def check_prereqs() -> List[str]:
    """ Check that the Pfam to Gene Ontology mapping file
        ('pfam2go-march-2018.txt' in the data directory) can be found.

        Returns:
            a list with a single error message if the file is missing,
            otherwise an empty list
    """
    mapping_file = path.get_full_path(__file__, 'data', 'pfam2go-march-2018.txt')
    if path.locate_file(mapping_file) is not None:
        return []
    return ['Failed to locate Pfam to Gene Ontology mapping file']
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures packaged data is fully prepared

        The combined 'bgc_seeds.hmm' file is regenerated from the individual
        signature profiles when it is missing or older than any of them, then
        the pressed database files are checked.

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)

        Raises:
            ValueError: if the signature profiles cannot be loaded and
                        logging_only is False
            OSError: if regenerating the seeds file fails and logging_only is False
    """
    failure_messages = []

    # Check that hmmdetails.txt is readable and well-formatted
    try:
        profiles = get_signature_profiles()
    except ValueError as err:
        if not logging_only:
            raise
        return [str(err)]

    # the path to the markov model
    seeds_hmm = path.get_full_path(__file__, 'data', 'bgc_seeds.hmm')
    # resolve every component profile once, so the timestamp check below looks
    # at exactly the files that are concatenated during regeneration, and not
    # at a path relative to the current working directory
    hmm_files = [path.get_full_path(__file__, os.path.join("data", sig.hmm_file))
                 for sig in profiles]

    outdated = False
    if not path.locate_file(seeds_hmm):
        logging.debug("%s: %s doesn't exist, regenerating", NAME, seeds_hmm)
        outdated = True
    else:
        seeds_timestamp = os.path.getmtime(seeds_hmm)
        for component in hmm_files:
            if os.path.getmtime(component) > seeds_timestamp:
                logging.debug("%s out of date, regenerating", seeds_hmm)
                outdated = True
                break

    # regenerate if missing or out of date
    if outdated:
        # try to generate file from all specified profiles in hmmdetails
        try:
            with open(seeds_hmm, 'w') as all_hmms_handle:
                for hmm_file in hmm_files:
                    with open(hmm_file, 'r') as handle:
                        all_hmms_handle.write(handle.read())
        except OSError:
            if not logging_only:
                raise
            failure_messages.append('Failed to generate file {!r}'.format(seeds_hmm))

    # if regeneration failed, don't try to run hmmpress
    if failure_messages:
        return failure_messages

    failure_messages.extend(hmmer.ensure_database_pressed(seeds_hmm, return_not_raise=logging_only))

    return failure_messages
def check_db(db_path: str) -> List[str]:
    """ Check that all required files exist for a database

        Arguments:
            db_path: the directory expected to contain 'Pfam-A.hmm' and its
                     '.h3f', '.h3i', '.h3m' and '.h3p' companion files

        Returns:
            a list of error messages, one for each file that was not found
    """
    profile = 'Pfam-A.hmm'
    expected = [profile] + [profile + suffix for suffix in ('.h3f', '.h3i', '.h3m', '.h3p')]
    return [
        "Failed to locate file: %r in %s" % (name, db_path)
        for name in expected
        if not path.locate_file(os.path.join(db_path, name))
    ]
def check_prereqs(_options: ConfigType) -> List[str]:
    """ Check that the Pfam to Gene Ontology mapping file (DATA_FILE) can be
        found.

        Arguments:
            _options: unused

        Returns:
            a list with a single error message if the file is missing,
            otherwise an empty list
    """
    if path.locate_file(DATA_FILE) is not None:
        return []
    return ['Failed to locate Pfam to Gene Ontology mapping file']
def check_prereqs() -> List[str]:
    """ Check that the required executables and the 'smcogs.hmm' profile are
        available, running hmmpress on the profile if any of its binary files
        are missing.

        Returns:
            a list of strings describing any failures, empty if there were none
    """
    problems = [
        "Failed to locate file: %r" % name
        for name in ['muscle', 'hmmscan', 'hmmpress', 'fasttree', 'java']
        if path.locate_executable(name) is None
    ]

    suffixes = ['.h3f', '.h3i', '.h3m', '.h3p']
    for profile_name in ['smcogs.hmm']:
        profile = path.get_full_path(__file__, 'data', profile_name)
        if path.locate_file(profile) is None:
            problems.append("Failed to locate file %r" % profile)
            continue
        for suffix in suffixes:
            if path.locate_file("%s%s" % (profile, suffix)) is not None:
                continue
            # hmmpress is run on the profile itself, not per extension, so
            # stop looking after the first missing binary file
            outcome = subprocessing.run_hmmpress(profile)
            if not outcome.successful():
                problems.append("Failed to hmmpress %s: %s" % (profile, outcome.stderr.rstrip()))
            break

    return problems
def check_clusterblast_files(definition_file: str, fasta_file: str,
                             db_file: str, logging_only: bool = False) -> List[str]:
    """ Verify the clusterblast data files, rebuilding the diamond database
        from the fasta file if it is missing or of an incompatible version.

        Arguments:
            definition_file: the path to the cluster definition TSV file
            fasta_file: the path to the cluster proteins fasta file
            db_file: the path to the diamond database file
            logging_only: if True, a failed rebuild is reported in the returned
                          list instead of the RuntimeError being re-raised

        Returns:
            A list of error strings the way `check_prereqs` does
    """
    errors = []  # type: List[str]

    if path.locate_file(definition_file) is None:
        errors.append("Failed to locate cluster definition file: {!r}".format(definition_file))

    needs_rebuild = False
    if path.locate_file(fasta_file) is None:
        # without the fasta file a rebuild cannot be attempted at all
        errors.append("Failed to locate cluster proteins: {!r}".format(fasta_file))
    elif path.locate_file(db_file) is None:
        needs_rebuild = True
    elif not check_diamond_db_compatible(db_file):
        needs_rebuild = True

    if not needs_rebuild:
        return errors

    try:
        logging.debug("diamond database %r missing or incompatible version, regenerating.", db_file)
        subprocessing.run_diamond_makedb(db_file, fasta_file)
    except RuntimeError:
        if not logging_only:
            raise
        errors.append("Failed to regenerate diamond database %r" % db_file)

    return errors
def check_prereqs() -> List[str]:
    """ Check the prerequisites.
            hmmscan: domain detection
            blastp: CLF and starter unit analysis
            HMMs: t2pks.hmm

        The profile is pressed with hmmpress if any of its binary files are
        missing.

        Returns:
            a list of strings describing any errors, if they occurred
    """
    problems = [
        "Failed to locate file: %r" % name
        for name in ['hmmscan', 'blastp']
        if path.locate_executable(name) is None
    ]

    hmm_suffixes = ['.h3f', '.h3i', '.h3m', '.h3p']
    for profile_name in ['t2pks.hmm']:
        profile = path.get_full_path(__file__, 'data', profile_name)
        if path.locate_file(profile) is None:
            problems.append("Failed to locate file %r" % profile)
            continue
        for suffix in hmm_suffixes:
            if path.locate_file("%s%s" % (profile, suffix)) is not None:
                continue
            # hmmpress is run on the profile itself, not per extension, so
            # stop looking after the first missing binary file
            outcome = subprocessing.run_hmmpress(profile)
            if not outcome.successful():
                problems.append("Failed to hmmpress %s: %s" % (profile, outcome.stderr.rstrip()))
            break

    # each blast database needs its fasta source and all three index files
    blast_files = [
        path.get_full_path(__file__, 'data', database + suffix)
        for database in ['KSIII', 'AT', 'LIG']
        for suffix in ['.fasta', '.phr', '.pin', '.psq']
    ]
    for blast_file in blast_files:
        if path.locate_file(blast_file) is None:
            problems.append("Failed to locate file %r" % blast_file)

    return problems
def check_prereqs() -> List[str]:
    """ Check that the required executables and the Resfam database are
        available, then run the data preparation in reporting mode.

        Returns:
            a list of strings describing any failures, empty if there were none
    """
    problems = [
        "Failed to locate file: %r" % name
        for name in ['hmmscan', 'hmmpress']
        if path.locate_executable(name) is None
    ]

    resfam_db = os.path.join(get_config().database_dir, 'resfam', 'Resfams.hmm')
    if path.locate_file(resfam_db) is None:
        problems.append('Failed to locate Resfam database in %s' % resfam_db)

    # preparation problems are collected as messages rather than raised
    preparation_problems = prepare_data(logging_only=True)
    problems.extend(preparation_problems)
    return problems
def get_git_version(fallback_filename: Optional[str] = GIT_VERSION_FALLBACK_FILENAME) -> str:
    """ Get the short sha1 of the current git HEAD.

        If 'git status --porcelain' lists any changes, "(changed)" is appended.
        When no version could be obtained from git, the stripped content of
        the fallback file is used, if that file can be located.

        Arguments:
            fallback_filename: the file to read a version from when git gives
                               none, a false value disables the fallback

        Returns:
            the version string, which is empty if neither source provided one
    """
    revision = ""
    try:
        rev_parse = execute(['git', 'rev-parse', '--short', 'HEAD'])
        status = execute(['git', 'status', '--porcelain'])
        if rev_parse.successful() and status.successful():
            revision = rev_parse.stdout.strip()
            if status.stdout.splitlines():
                revision += "(changed)"
    except OSError:
        # an OSError while running git is deliberately ignored,
        # the fallback file is tried instead
        pass

    if revision != "" or not fallback_filename:
        return revision

    if locate_file(fallback_filename, silent=True):
        with open(fallback_filename, 'rt') as handle:
            revision = handle.read().strip()
    return revision
def check_sub_prereqs(options: ConfigType) -> List[str]:
    """ Report any missing executables or data files needed for the
        subcluster comparison.

        Arguments:
            options: the config object, only its 'executables' member is used

        Returns:
            a list of error messages, one for each missing executable or file
    """
    binaries = ['blastp', 'makeblastdb']
    data_files = [
        'proteins.fasta',
        'proteins.fasta.phr',
        'proteins.fasta.pin',
        'proteins.fasta.psq',
        'clusters.txt',
    ]

    problems = [
        "Failed to locate file: %r" % name
        for name in binaries
        if name not in options.executables
    ]
    problems.extend(
        "Failed to locate file: %r" % name
        for name in data_files
        if path.locate_file(_get_datafile_path(name)) is None
    )
    return problems
def check_prereqs(options: ConfigType) -> List[str]:
    """ Checks if all required files and applications are around

        Arguments:
            options: the config object, only its 'executables' member is used

        Returns:
            a list of error messages, one for each missing executable or profile
    """
    problems = [
        "Failed to locate file: %r" % name
        for name in ['hmmpfam2', 'hmmscan', 'hmmpress']
        if name not in options.executables
    ]

    # the HMM profiles that are expected in the data directory
    profiles = [
        "PKSI-KR.hmm2",
        "PKSI-KS_N.hmm2",
        "PKSI-KS_C.hmm2",
        "PKSI-AT.hmm2",
        "PKSI-ACP.hmm2",
        "PKSI-DH.hmm2",
        "Thioesterase.hmm2",
        "PKSI-ER.hmm2",
        "p450.hmm2",
    ]
    for profile in profiles:
        profile_path = path.get_full_path(__file__, "data", profile)
        if path.locate_file(profile_path) is None:
            problems.append("Failed to locate file: %s" % profile)

    return problems
def ensure_database_pressed(filepath: str, return_not_raise: bool = False) -> List[str]:
    """ Ensures that the given HMMer database exists and that the hmmpress
        generated files aren't out of date.

        Arguments:
            filepath: the path to the HMMer database
            return_not_raise: whether to catch errors and return their messages as strings

        Returns:
            any encountered error messages, will never be populated without
            return_not_raise == True

        Raises:
            FileNotFoundError: if the database is missing and return_not_raise is False
            RuntimeError: if hmmpress fails and return_not_raise is False
    """
    try:
        database_mtime = os.path.getmtime(filepath)
    except FileNotFoundError as err:
        if return_not_raise:
            return [str(err)]
        raise

    needs_press = False
    for extension in ['.h3f', '.h3i', '.h3m', '.h3p']:
        pressed_file = "{}{}".format(filepath, extension)
        # a pressed file is fine when it exists and is not older than the database
        if path.locate_file(pressed_file) and os.path.getmtime(pressed_file) >= database_mtime:
            continue
        logging.info("%s does not exist or is out of date, hmmpressing %s",
                     pressed_file, filepath)
        needs_press = True
        break

    if not needs_press:
        return []

    outcome = subprocessing.run_hmmpress(filepath)
    if outcome.successful():
        return []

    message = "Failed to hmmpress {!r}: {}".format(filepath, outcome.stderr)
    if return_not_raise:
        return [message]
    raise RuntimeError(message)
def check_prereqs(options: ConfigType) -> List[str]:
    """ Ensure the hmmscan executable is configured and that the TIGRFam
        database exists and is pressed.

        Arguments:
            options: the config object, its 'executables' and 'database_dir'
                     members are used

        Returns:
            a list of error messages, one for each failed check
    """
    problems = [
        f"Failed to locate executable: {name!r}"
        for name in ['hmmscan']
        if name not in options.executables
    ]

    # account for database directories mounted into docker containers
    if "mounted_at_runtime" in options.database_dir:
        return problems

    tigr_dir = os.path.join(options.database_dir, "tigrfam")
    tigr_db = os.path.join(tigr_dir, "TIGRFam.hmm")
    if not path.locate_file(tigr_db):
        problems.append(f"Failed to locate TIGRFam db in {tigr_dir}")

    # pressing problems are collected as messages rather than raised
    problems.extend(hmmer.ensure_database_pressed(tigr_db, return_not_raise=True))
    return problems
def check_prereqs() -> List[str]:
    """ Check the prerequisites.
            hmmscan: domain detection
            blastp: CLF and starter unit analysis
            HMMs: t2pks.hmm

        The data preparation is also run, in reporting mode.

        Returns:
            a list of strings describing any errors, if they occurred
    """
    problems = [
        "Failed to locate file: %r" % name
        for name in ['hmmscan', "hmmpress", 'blastp']
        if path.locate_executable(name) is None
    ]

    # each blast database needs its fasta source and all three index files
    blast_files = [
        path.get_full_path(__file__, 'data', database + suffix)
        for database in ['KSIII', 'AT', 'LIG']
        for suffix in ['.fasta', '.phr', '.pin', '.psq']
    ]
    for blast_file in blast_files:
        if path.locate_file(blast_file) is None:
            problems.append("Failed to locate file %r" % blast_file)

    # preparation problems are collected as messages rather than raised
    problems.extend(prepare_data(logging_only=True))
    return problems
def check_known_prereqs(_options: ConfigType) -> List[str]:
    """ Determines if any prerequisite data files or executables are missing

        Arguments:
            _options: unused

        Returns:
            a list of error messages, one for each failing prerequisite check
    """
    # every entry is (name, whether it is optional)
    binaries = [
        ('blastp', False),
        ('makeblastdb', False),
        ('diamond', False),
    ]
    data_files = [
        ('knownclusterprots.fasta', False),
        ('knownclusterprots.dmnd', False),
        ('knownclusters.txt', False),
    ]

    problems = [
        "Failed to locate file: %r" % name
        for name, is_optional in binaries
        if path.locate_executable(name) is None and not is_optional
    ]
    problems.extend(
        "Failed to locate file: %r" % name
        for name, is_optional in data_files
        if path.locate_file(_get_datafile_path(name)) is None and not is_optional
    )
    return problems