Exemple #1
0
def check_prereqs() -> List[str]:
    """ Verify that the HMMer executables and the bundled profiles are usable.

        A profile that lacks any of its '.h3?' companion files is passed to
        hmmpress, at most once per profile.

        Returns:
            a list of error messages, empty if every check passed
    """
    problems = []
    for executable in ('hmmscan', 'hmmpress'):
        if path.locate_executable(executable) is None:
            problems.append("Failed to locate executable for %r" %
                            executable)

    profile_names = ('abmotifs.hmm', 'dockingdomains.hmm',
                     'ksdomains.hmm', 'nrpspksdomains.hmm')
    profiles = [path.get_full_path(__file__, 'data', name)
                for name in profile_names]
    pressed_suffixes = ('.h3f', '.h3i', '.h3m', '.h3p')

    for profile in profiles:
        if path.locate_file(profile) is None:
            problems.append("Failed to locate file %r" % profile)
            continue
        # stops looking at the first missing companion file
        incomplete = any(
            path.locate_file("{}{}".format(profile, suffix)) is None
            for suffix in pressed_suffixes)
        if not incomplete:
            continue
        outcome = subprocessing.run_hmmpress(profile)
        if not outcome.successful():
            problems.append('Failed to hmmpress {!r}: {}'.format(profile, outcome.stderr))

    return problems
Exemple #2
0
def check_prereqs() -> List[str]:
    """ Checks if all required files and applications are around.

        Returns:
            a list of error messages, one for each failed check
    """
    # suffixes of the companion files looked for next to a pressed profile
    _binary_extensions = ['.h3f', '.h3i', '.h3m', '.h3p']

    failure_messages = []

    for binary_name in ['hmmpfam2', 'hmmscan', 'hmmpress']:
        if not path.locate_executable(binary_name):
            failure_messages.append("Failed to locate file: %r" % binary_name)

    # The profile names are hard-coded in this list
    for profile in ["PKSI-KR.hmm2", "PKSI-KS_N.hmm2", "PKSI-KS_C.hmm2", "PKSI-AT.hmm2",
                    "PKSI-ACP.hmm2", "PKSI-DH.hmm2", "Thioesterase.hmm2", "PKSI-ER.hmm2",
                    "aa-activating.aroundLys.hmm2", "p450.hmm2"]:
        full_hmm_path = path.get_full_path(__file__, "data", profile)

        if path.locate_file(full_hmm_path) is None:
            failure_messages.append("Failed to locate file: %s" % profile)
            continue

        # Profiles ending in .hmm2 are only checked for existence. Every name
        # in the list above has that suffix, so the pressing logic below is
        # only reached if a profile with another suffix is added.
        if profile.endswith(".hmm2"):
            continue

        for ext in _binary_extensions:
            binary = "{hmm}{ext}".format(hmm=full_hmm_path, ext=ext)
            if not path.locate_file(binary):
                # a companion file is missing: press the profile
                result = subprocessing.run_hmmpress(full_hmm_path)
                if not result.successful():
                    failure_messages.append("Failed to hmmpress {!r}: {!r}".format(profile, result.stderr))

                # hmmpress generates _all_ binary files in one go, so stop the loop
                break

            binary_mtime = os.path.getmtime(binary)
            hmm_mtime = os.path.getmtime(full_hmm_path)
            if hmm_mtime < binary_mtime:
                # companion file is newer than the profile, move on to the next one
                continue
            # companion file is not newer than the profile: delete every
            # '.h3?' file of this profile, then press again
            try:
                for filename in glob.glob("{}.h3?".format(full_hmm_path)):
                    logging.debug("removing outdated file %r", filename)
                    os.remove(filename)
            except OSError as err:
                failure_messages.append("Failed to remove outdated binary file for %s: %s" %
                                        (profile, err))
                break
            result = subprocessing.run_hmmpress(full_hmm_path)
            if not result.successful():
                # both messages are only recorded when the re-press itself fails
                failure_messages.append("Failed to hmmpress %r: %r" % (profile, result.stderr))
                failure_messages.append("HMM binary files outdated. %s (changed: %s) vs %s (changed: %s)" %
                                        (profile, datetime.datetime.fromtimestamp(hmm_mtime),
                                         binary, datetime.datetime.fromtimestamp(binary_mtime)))
            # hmmpress generates _all_ binary files in one go, so stop the loop
            break

    return failure_messages
def check_diamond_files(definition_file: str,
                        fasta_file: str,
                        db_file: str,
                        logging_only: bool = False) -> List[str]:
    """ Verify a diamond database and the files it depends on, rebuilding the
        database when it is missing, incompatible, or outdated.

        Arguments:
            definition_file: the path to a database metadata file
            fasta_file: the path to a proteins fasta file
            db_file: the path to the diamond database file
            logging_only: if True, a missing fasta file or a failed rebuild is
                          reported in the returned list instead of being raised

        Returns:
            a list of error strings, empty if nothing went wrong
    """
    problems: List[str] = []

    if path.locate_file(definition_file) is None:
        problems.append(
            "Failed to locate cluster definition file: {!r}".format(
                definition_file))

    rebuild_reason = ""
    if path.locate_file(fasta_file) is None:
        missing_fasta = "Failed to locate cluster proteins: {!r}".format(fasta_file)
        problems.append(missing_fasta)
        if not logging_only:
            raise FileNotFoundError(missing_fasta)
    else:
        # only the first applicable reason is kept
        if path.locate_file(db_file) is None:
            rebuild_reason = f"could not find diamond database: {db_file}"
        elif not check_diamond_db_compatible(db_file):
            rebuild_reason = f"incompatible diamond database version: {db_file}"
        elif path.is_outdated(db_file, fasta_file):
            rebuild_reason = f"diamond database outdated: {db_file}"

    if rebuild_reason:
        try:
            logging.debug("%s, regenerating", rebuild_reason)
            run_diamond_makedb(db_file, fasta_file)
        except RuntimeError:
            if logging_only:
                problems.append(
                    "Failed to regenerate diamond database %r" % db_file)
            else:
                raise

    if not problems:
        return problems

    # name the executable in use alongside any reported problem
    problems.append(
        f"with diamond executable: {get_config().executables.diamond}")
    return problems
Exemple #4
0
def check_clusterblast_files(definition_file: str,
                             fasta_file: str,
                             db_file: str,
                             logging_only: bool = False) -> List[str]:
    """ Verify the clusterblast data files, rebuilding the diamond database
        when it is missing, incompatible, or outdated.

        Arguments:
            definition_file: the path to the cluster definition TSV file
            fasta_file: the path to the cluster proteins fasta file
            db_file: the path to the diamond database file
            logging_only: if True, a failed rebuild is reported in the
                          returned list instead of being raised

        Returns:
            a list of error strings, empty if nothing went wrong
    """
    errors: List[str] = []

    if path.locate_file(definition_file) is None:
        errors.append(
            "Failed to locate cluster definition file: {!r}".format(
                definition_file))

    regen_reason = ""
    if path.locate_file(fasta_file) is None:
        errors.append(
            "Failed to locate cluster proteins: {!r}".format(fasta_file))
    else:
        # checks run lazily and in order, the first failing one gives the reason
        db_checks = [
            (lambda: path.locate_file(db_file) is None,
             "could not find diamond database: %s"),
            (lambda: not check_diamond_db_compatible(db_file),
             "incompatible diamond database version: %s"),
            (lambda: path.is_outdated(db_file, fasta_file),
             "diamond database outdated: %s"),
        ]
        for has_failed, template in db_checks:
            if has_failed():
                regen_reason = template % db_file
                break

    if regen_reason:
        try:
            logging.debug("%s, regenerating", regen_reason)
            subprocessing.run_diamond_makedb(db_file, fasta_file)
        except RuntimeError:
            if logging_only:
                errors.append(
                    "Failed to regenerate diamond database %r" % db_file)
            else:
                raise

    if errors:
        # name the executable in use alongside any reported problem
        errors.append("with diamond executable: %s" %
                      get_config().executables.diamond)

    return errors
Exemple #5
0
def check_sub_prereqs(_options: ConfigType) -> List[str]:
    """ Check that the executables and data files for subclusters are present.

        Arguments:
            _options: unused

        Returns:
            a list of error messages, one for each missing prerequisite
    """
    executables = ('blastp', 'makeblastdb')
    data_files = (
        'subclusterprots.fasta',
        'subclusterprots.fasta.phr',
        'subclusterprots.fasta.pin',
        'subclusterprots.fasta.psq',
        'subclusters.txt',
    )

    missing = [
        "Failed to locate file: %r" % executable
        for executable in executables
        if path.locate_executable(executable) is None
    ]
    missing.extend(
        "Failed to locate file: %r" % data_file
        for data_file in data_files
        if path.locate_file(_get_datafile_path(data_file)) is None
    )
    return missing
def check_prereqs() -> List[str]:
    """ Check that the executables and clusterblast database files are
        present, then run the known-cluster and subcluster checks.

        Returns:
            a list of error messages from this check and both sub-checks
    """
    config = get_config()
    db_dir = os.path.join(config.database_dir, "clusterblast")

    problems = [
        "Failed to locate file: %r" % executable
        for executable in ('blastp', 'makeblastdb', 'diamond')
        if path.locate_executable(executable) is None
    ]

    for data_file in ('geneclusterprots.dmnd',
                      'geneclusterprots.fasta',
                      'geneclusters.txt'):
        if path.locate_file(os.path.join(db_dir, data_file)) is None:
            problems.append("Failed to locate file: %r" % data_file)

    for sub_check in (check_known_prereqs, check_sub_prereqs):
        problems.extend(sub_check(config))
    return problems
Exemple #7
0
def check_prereqs() -> List[str]:
    """ Check that prereqs are satisfied, generating the combined
        'bgc_seeds.hmm' profile and its hmmpress output when they are missing.

        Both hmmsearch and hmmpress must be locatable.

        Returns:
            a list of error messages, empty if every check passed
    """
    failure_messages = []
    for binary_name in ('hmmsearch', 'hmmpress'):
        if path.locate_executable(binary_name) is None:
            failure_messages.append("Failed to locate executable for %r" %
                                    binary_name)

    # Check that hmmdetails.txt is readable and well-formatted
    try:
        profiles = get_signature_profiles()
    except ValueError as err:
        # without the profile list nothing below can work, so report what
        # has been found so far instead of iterating over a missing value
        failure_messages.append(str(err))
        return failure_messages

    # the path to the markov model
    hmm = path.get_full_path(__file__, 'data', 'bgc_seeds.hmm')
    hmm_files = [os.path.join("data", sig.hmm_file) for sig in profiles]
    if path.locate_file(hmm) is None:
        # try to generate file from all specified profiles in hmmdetails
        try:
            with open(hmm, 'w') as all_hmms_handle:
                for hmm_file in hmm_files:
                    with open(path.get_full_path(__file__, hmm_file),
                              'r') as handle:
                        all_hmms_handle.write(handle.read())
        except OSError:
            failure_messages.append('Failed to generate file {!r}'.format(hmm))

    # if previous steps have failed, the remainder will too, so don't try
    if failure_messages:
        return failure_messages

    binary_extensions = ['.h3f', '.h3i', '.h3m', '.h3p']
    for ext in binary_extensions:
        binary = "{}{}".format(hmm, ext)
        if path.locate_file(binary) is None:
            # press once as soon as any companion file is found missing
            result = run_hmmpress(hmm)
            if not result.successful():
                failure_messages.append('Failed to hmmpress {!r}: {}'.format(
                    hmm, result.stderr))
            break

    return failure_messages
Exemple #8
0
def check_prereqs() -> List[str]:
    """ Check that the Pfam to Gene Ontology mapping file
        ('pfam2go-march-2018.txt') is present in the data directory.

        Returns:
            a list holding one error message if the file is missing,
            otherwise an empty list
    """
    mapping_file = path.get_full_path(__file__, 'data', 'pfam2go-march-2018.txt')
    if path.locate_file(mapping_file) is not None:
        return []
    return ['Failed to locate Pfam to Gene Ontology mapping file']
Exemple #9
0
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Ensures packaged data is fully prepared

        The combined 'bgc_seeds.hmm' file is regenerated when it is missing
        or older than any of its component profiles, then handed to
        hmmer.ensure_database_pressed().

        Arguments:
            logging_only: whether to return error messages instead of raising exceptions

        Returns:
            a list of error messages (only if logging_only is True)
    """
    failure_messages = []

    # Check that hmmdetails.txt is readable and well-formatted
    try:
        profiles = get_signature_profiles()
    except ValueError as err:
        if not logging_only:
            raise
        return [str(err)]

    # the path to the markov model
    seeds_hmm = path.get_full_path(__file__, 'data', 'bgc_seeds.hmm')
    # Resolve every component profile relative to this module once, so the
    # freshness check and the regeneration below look at the same files
    # regardless of the current working directory.
    hmm_files = [
        path.get_full_path(__file__, os.path.join("data", sig.hmm_file))
        for sig in profiles
    ]
    outdated = False
    if not path.locate_file(seeds_hmm):
        logging.debug("%s: %s doesn't exist, regenerating", NAME, seeds_hmm)
        outdated = True
    else:
        seeds_timestamp = os.path.getmtime(seeds_hmm)
        for component in hmm_files:
            if os.path.getmtime(component) > seeds_timestamp:
                logging.debug("%s out of date, regenerating", seeds_hmm)
                outdated = True
                break

    # regenerate if missing or out of date
    if outdated:
        # try to generate file from all specified profiles in hmmdetails
        try:
            with open(seeds_hmm, 'w') as all_hmms_handle:
                for hmm_file in hmm_files:
                    with open(hmm_file, 'r') as handle:
                        all_hmms_handle.write(handle.read())
        except OSError:
            if not logging_only:
                raise
            failure_messages.append(
                'Failed to generate file {!r}'.format(seeds_hmm))

    # if regeneration failed, don't try to run hmmpress
    if failure_messages:
        return failure_messages

    failure_messages.extend(
        hmmer.ensure_database_pressed(seeds_hmm,
                                      return_not_raise=logging_only))

    return failure_messages
Exemple #10
0
def check_db(db_path: str) -> List[str]:
    """ Check that all required files exist for a database

        Arguments:
            db_path: the directory expected to contain the Pfam-A files

        Returns:
            a list of error messages, one for each missing file
    """
    required = ('Pfam-A.hmm', 'Pfam-A.hmm.h3f', 'Pfam-A.hmm.h3i',
                'Pfam-A.hmm.h3m', 'Pfam-A.hmm.h3p')
    return [
        "Failed to locate file: %r in %s" % (file_name, db_path)
        for file_name in required
        if not path.locate_file(os.path.join(db_path, file_name))
    ]
Exemple #11
0
def check_prereqs(_options: ConfigType) -> List[str]:
    """ Check that the Pfam to Gene Ontology mapping file is present.

        Arguments:
            _options: unused

        Returns:
            a list holding one error message if the file is missing,
            otherwise an empty list
    """
    if path.locate_file(DATA_FILE) is not None:
        return []
    return ['Failed to locate Pfam to Gene Ontology mapping file']
Exemple #12
0
def check_prereqs() -> List[str]:
    """ Check that the required executables and the 'smcogs.hmm' profile are
        available, pressing the profile if a companion file is missing.

        Returns:
            a list of error messages, empty if every check passed
    """
    errors = [
        "Failed to locate file: %r" % executable
        for executable in ('muscle', 'hmmscan', 'hmmpress', 'fasttree', 'java')
        if path.locate_executable(executable) is None
    ]

    profile = path.get_full_path(__file__, 'data', 'smcogs.hmm')
    if path.locate_file(profile) is None:
        errors.append("Failed to locate file %r" % profile)
        return errors

    for suffix in ('.h3f', '.h3i', '.h3m', '.h3p'):
        if path.locate_file("%s%s" % (profile, suffix)) is not None:
            continue
        # first missing companion file: press once and stop looking
        outcome = subprocessing.run_hmmpress(profile)
        if not outcome.successful():
            errors.append("Failed to hmmpress %s: %s" %
                          (profile, outcome.stderr.rstrip()))
        break
    return errors
Exemple #13
0
def check_clusterblast_files(definition_file: str,
                             fasta_file: str,
                             db_file: str,
                             logging_only: bool = False) -> List[str]:
    """ Verify the clusterblast data files, rebuilding the diamond database
        when it is missing or of an incompatible version.

        Arguments:
            definition_file: the path to the cluster definition TSV file
            fasta_file: the path to the cluster proteins fasta file
            db_file: the path to the diamond database file
            logging_only: if True, a failed rebuild is reported in the
                          returned list instead of being raised

        Returns:
            a list of error strings, empty if nothing went wrong
    """
    errors = []  # type: List[str]

    if path.locate_file(definition_file) is None:
        errors.append(
            "Failed to locate cluster definition file: {!r}".format(
                definition_file))

    if path.locate_file(fasta_file) is None:
        errors.append(
            "Failed to locate cluster proteins: {!r}".format(fasta_file))
        return errors

    db_usable = (path.locate_file(db_file) is not None
                 and check_diamond_db_compatible(db_file))
    if db_usable:
        return errors

    try:
        logging.debug(
            "diamond database %r missing or incompatible version, regenerating.",
            db_file)
        subprocessing.run_diamond_makedb(db_file, fasta_file)
    except RuntimeError:
        if logging_only:
            errors.append(
                "Failed to regenerate diamond database %r" % db_file)
        else:
            raise

    return errors
Exemple #14
0
def check_prereqs() -> List[str]:
    """ Check the prerequisites.
            executables: hmmscan, blastp
            HMM profile: t2pks.hmm (pressed if a companion file is missing)
            blast databases: KSIII, AT, LIG

        Returns:
            a list of strings describing any errors, if they occurred
    """
    errors = [
        "Failed to locate file: %r" % executable
        for executable in ('hmmscan', 'blastp')
        if path.locate_executable(executable) is None
    ]

    profile = path.get_full_path(__file__, 'data', 't2pks.hmm')
    if path.locate_file(profile) is None:
        errors.append("Failed to locate file %r" % profile)
    else:
        for suffix in ('.h3f', '.h3i', '.h3m', '.h3p'):
            if path.locate_file("%s%s" % (profile, suffix)) is not None:
                continue
            # first missing companion file: press once and stop looking
            outcome = subprocessing.run_hmmpress(profile)
            if not outcome.successful():
                errors.append("Failed to hmmpress %s: %s" %
                              (profile, outcome.stderr.rstrip()))
            break

    for db_name in ('KSIII', 'AT', 'LIG'):
        for suffix in ('.fasta', '.phr', '.pin', '.psq'):
            db_part = path.get_full_path(__file__, 'data', db_name + suffix)
            if path.locate_file(db_part) is None:
                errors.append("Failed to locate file %r" % db_part)

    return errors
Exemple #15
0
def check_prereqs() -> List[str]:
    """ Check that the HMMer executables and the Resfam database are
        available, then prepare the packaged data.

        Returns:
            a list of error messages, empty if every check passed
    """
    problems = [
        "Failed to locate file: %r" % executable
        for executable in ('hmmscan', 'hmmpress')
        if path.locate_executable(executable) is None
    ]

    resfam_db = os.path.join(get_config().database_dir, 'resfam', 'Resfams.hmm')
    if path.locate_file(resfam_db) is None:
        problems.append('Failed to locate Resfam database in %s' %
                        resfam_db)

    # errors are collected rather than raised
    problems.extend(prepare_data(logging_only=True))

    return problems
def get_git_version(fallback_filename: Optional[str] = GIT_VERSION_FALLBACK_FILENAME) -> str:
    """ Get the short sha1 of the current git revision.

        Arguments:
            fallback_filename: a file to read the version from when git gives
                               no answer; a falsy value disables the fallback

        Returns:
            the version string, with "(changed)" appended when
            'git status --porcelain' prints anything, or an empty string if
            no version could be determined
    """
    version = ""
    try:
        revision = execute(['git', 'rev-parse', '--short', 'HEAD'])
        status = execute(['git', 'status', '--porcelain'])
        if revision.successful() and status.successful():
            version = revision.stdout.strip()
            if status.stdout.splitlines():
                version += "(changed)"
    except OSError:
        # fall through to the fallback file
        pass

    if version != "" or not fallback_filename:
        return version

    if locate_file(fallback_filename, silent=True):
        with open(fallback_filename, 'rt') as handle:
            version = handle.read().strip()
    return version
Exemple #17
0
def check_sub_prereqs(options: ConfigType) -> List[str]:
    """ Check that the required executables are configured and the required
        data files are present.

        Arguments:
            options: the config object, only its 'executables' are consulted

        Returns:
            a list of error messages, one for each missing prerequisite
    """
    executables = ('blastp', 'makeblastdb')
    data_files = (
        'proteins.fasta', 'proteins.fasta.phr', 'proteins.fasta.pin',
        'proteins.fasta.psq', 'clusters.txt',
    )

    missing = [
        "Failed to locate file: %r" % executable
        for executable in executables
        if executable not in options.executables
    ]
    missing.extend(
        "Failed to locate file: %r" % data_file
        for data_file in data_files
        if path.locate_file(_get_datafile_path(data_file)) is None
    )
    return missing
Exemple #18
0
def check_prereqs(options: ConfigType) -> List[str]:
    """ Checks if all required files and applications are around

        Arguments:
            options: the config object, only its 'executables' are consulted

        Returns:
            a list of error messages, one for each missing prerequisite
    """
    missing = [
        "Failed to locate file: %r" % executable
        for executable in ('hmmpfam2', 'hmmscan', 'hmmpress')
        if executable not in options.executables
    ]

    # the profile names are hard-coded here
    profile_names = (
        "PKSI-KR.hmm2", "PKSI-KS_N.hmm2", "PKSI-KS_C.hmm2", "PKSI-AT.hmm2",
        "PKSI-ACP.hmm2", "PKSI-DH.hmm2", "Thioesterase.hmm2",
        "PKSI-ER.hmm2", "p450.hmm2",
    )
    for profile_name in profile_names:
        profile_path = path.get_full_path(__file__, "data", profile_name)
        if path.locate_file(profile_path) is None:
            missing.append("Failed to locate file: %s" % profile_name)

    return missing
Exemple #19
0
def ensure_database_pressed(filepath: str,
                            return_not_raise: bool = False) -> List[str]:
    """ Ensures that the given HMMer database exists and that its hmmpress
        generated files are present and not older than the database itself,
        running hmmpress when they are not.

        Arguments:
            filepath: the path to the HMMer database
            return_not_raise: whether to catch errors and return their messages as strings

        Returns:
            any encountered error messages, always empty unless return_not_raise is True
    """
    try:
        database_mtime = os.path.getmtime(filepath)
    except FileNotFoundError as err:
        if return_not_raise:
            return [str(err)]
        raise

    for suffix in ('.h3f', '.h3i', '.h3m', '.h3p'):
        pressed_file = "{}{}".format(filepath, suffix)
        if path.locate_file(pressed_file) and os.path.getmtime(pressed_file) >= database_mtime:
            continue
        logging.info("%s does not exist or is out of date, hmmpressing %s",
                     pressed_file, filepath)
        break
    else:
        # every pressed file is present and current
        return []

    outcome = subprocessing.run_hmmpress(filepath)
    if outcome.successful():
        return []

    message = "Failed to hmmpress {!r}: {}".format(filepath, outcome.stderr)
    if return_not_raise:
        return [message]
    raise RuntimeError(message)
Exemple #20
0
def check_prereqs(options: ConfigType) -> List[str]:
    """ Ensure hmmscan is configured and the TIGRFam database exists and is
        pressed.

        Arguments:
            options: the config object, its 'executables' and 'database_dir'
                     are consulted

        Returns:
            a list of error messages, empty if every check passed
    """
    problems = []
    executable = 'hmmscan'
    if executable not in options.executables:
        problems.append(
            f"Failed to locate executable: {executable!r}")

    # account for database directories mounted into docker containers
    if "mounted_at_runtime" in options.database_dir:
        return problems

    tigr_dir = os.path.join(options.database_dir, "tigrfam")
    tigr_db = os.path.join(tigr_dir, "TIGRFam.hmm")
    if not path.locate_file(tigr_db):
        problems.append(
            f"Failed to locate TIGRFam db in {tigr_dir}"
        )

    pressing_errors = hmmer.ensure_database_pressed(tigr_db, return_not_raise=True)
    problems.extend(pressing_errors)

    return problems
Exemple #21
0
def check_prereqs() -> List[str]:
    """ Check the prerequisites.
            executables: hmmscan, hmmpress, blastp
            blast databases: KSIII, AT, LIG
            packaged data: whatever prepare_data() reports

        Returns:
            a list of strings describing any errors, if they occurred
    """
    errors = [
        "Failed to locate file: %r" % executable
        for executable in ('hmmscan', "hmmpress", 'blastp')
        if path.locate_executable(executable) is None
    ]

    for db_name in ('KSIII', 'AT', 'LIG'):
        for suffix in ('.fasta', '.phr', '.pin', '.psq'):
            db_part = path.get_full_path(__file__, 'data', db_name + suffix)
            if path.locate_file(db_part) is None:
                errors.append("Failed to locate file %r" % db_part)

    # errors are collected rather than raised
    errors.extend(prepare_data(logging_only=True))

    return errors
def check_known_prereqs(_options: ConfigType) -> List[str]:
    """ Determines if any prerequisite data files or executables are missing

        Arguments:
            _options: unused

        Returns:
            a list of error messages, one for each failing prerequisite check
    """
    executables = ('blastp', 'makeblastdb', 'diamond')
    data_files = ('knownclusterprots.fasta',
                  'knownclusterprots.dmnd',
                  'knownclusters.txt')

    missing = [
        "Failed to locate file: %r" % executable
        for executable in executables
        if path.locate_executable(executable) is None
    ]
    missing.extend(
        "Failed to locate file: %r" % data_file
        for data_file in data_files
        if path.locate_file(_get_datafile_path(data_file)) is None
    )
    return missing