コード例 #1
0
ファイル: helpers.py プロジェクト: zachcp/antismash
def check_hmm_signatures(signature_file, hmm_dir):
    """ Checks that every signature loaded from a signature file has a profile
        file on disk whose NAME field matches the signature's name.

        Arguments:
            signature_file: passed to get_signature_profiles() to load the
                            signatures to check
            hmm_dir: the directory that each signature's path is joined onto

        Raises:
            AssertionError: if a profile file does not exist, contains no
                            NAME line, or its name differs from the signature's
    """
    sigs = get_signature_profiles(signature_file)
    for sig in sigs:
        hmm_file = os.path.abspath(os.path.join(hmm_dir, sig.path))
        assert os.path.exists(hmm_file), "missing HMM file: %s" % hmm_file
        name = None
        # a context manager guarantees the handle is closed, even on error
        with open(hmm_file) as handle:
            for line in handle:
                if line.startswith("NAME"):
                    # no early exit: if several NAME lines exist, the last one wins
                    name = line.split()[-1]
        assert name, "no NAME line found in %s" % hmm_file
        assert name == sig.name, "%s != %s" % (name, sig.name)
コード例 #2
0
ファイル: signatures.py プロジェクト: zachcp/antismash
def get_signature_profiles() -> List[signature.HmmSignature]:
    """ Builds the HMM signature profiles described by data/hmmdetails.txt
        (located relative to this file).

        The profiles are stored as an attribute on this function after the
        first call, so later calls return that same stored list instead of
        building it again.

        Returns:
            the list of HMM signature profiles
    """
    cached = getattr(get_signature_profiles, 'existing', None)
    if cached is None:
        # first call (or nothing stored yet): build the profiles and remember them
        details = path.get_full_path(__file__, "data", "hmmdetails.txt")
        cached = signature.get_signature_profiles(details)
        # mypy can't handle attributes set on functions, so silence it
        get_signature_profiles.existing = cached  # type: ignore
    else:
        # sanity check on whatever was stored by an earlier call
        assert isinstance(cached, list)
    return cached
コード例 #3
0
def get_sequence_counts(details_file: str) -> Dict[str, int]:
    """ Gets the number of sequences/seeds used to generate each HMM signature

        Arguments:
            details_file: a file containing all HMMs

        Returns:
            a dictionary mapping HMM name to the number of sequences used to
                generate it

        Raises:
            ValueError: if a profile file has no NSEQ line, or the value on
                        that line cannot be converted to an integer
    """
    prefix = 'NSEQ '
    result = {}
    for hmm in get_signature_profiles(details_file):
        # a context manager ensures each profile file is closed once read
        with open(path.get_full_path(details_file, hmm.hmm_file), 'r') as handle:
            for line in handle:
                if line.startswith(prefix):
                    # slice by the prefix length so that no digit is lost when
                    # only a single space follows the tag; strip() removes any
                    # extra padding and the trailing newline
                    result[hmm.name] = int(line[len(prefix):].strip())
                    break
        if hmm.name not in result:
            raise ValueError("Unknown number of seeds for hmm file: %s" % details_file)

    return result
コード例 #4
0
def get_sequence_counts(details_file: str) -> Dict[str, str]:
    """ Gets the number of sequences/seeds used to generate each HMM signature

        Arguments:
            details_file: a file containing all HMMs

        Returns:
            a dictionary mapping HMM name to the number of sequences used to
                generate it, as a string; profiles without an NSEQ line
                map to "?"
    """
    prefix = 'NSEQ '
    result = {}
    for hmm in get_signature_profiles(details_file):
        # a context manager ensures each profile file is closed once read
        with open(path.get_full_path(details_file, hmm.hmm_file), 'r') as handle:
            for line in handle:
                if line.startswith(prefix):
                    # slice by the prefix length so that no digit is lost when
                    # only a single space follows the tag; strip() removes any
                    # extra padding and the trailing newline
                    result[hmm.name] = line[len(prefix):].strip()
                    break
        # TODO: ideally this shouldn't ever happen, clean up inputs and change to error
        if hmm.name not in result:
            result[hmm.name] = "?"

    return result
コード例 #5
0
def detect_protoclusters_and_signatures(record: Record, signature_file: str, seeds_file: str,
                                        rule_files: List[str], valid_categories: Set[str],
                                        filter_file: str, tool: str,
                                        annotate_existing_subregions: bool = True) -> RuleDetectionResults:
    """ Compares all CDS features in a record with HMM signatures and generates
        Protocluster features based on those hits and the current protocluster detection
        rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rule_files: the files containing the rules to use for cluster definition
            valid_categories: a set containing valid rule category strings
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. rule_based_clusters)
            annotate_existing_subregions: if True, subregions already present in the record
                    will have domains annotated even if no protocluster is found

        Returns:
            a RuleDetectionResults built from the CDS results of each detected
            protocluster, the tool name, and the CDS results gathered from
            existing subregions that were not already covered by a protocluster

        Raises:
            ValueError: if no rule files are given, or if a hit matches no known
                        signature by either its query ID or its accession
    """
    if not rule_files:
        raise ValueError("rules must be provided")
    full_fasta = fasta.get_fasta_from_record(record)
    # if there's no CDS features, don't try to do anything
    if not full_fasta:
        return RuleDetectionResults({}, tool, [])
    sig_by_name = {sig.name: sig for sig in get_signature_profiles(signature_file)}
    rules: List[rule_parser.DetectionRule] = []
    aliases: Dict[str, List[rule_parser.Token]] = {}
    # rules accumulate across files: each call receives the rules parsed so far
    for rule_file in rule_files:
        rules = create_rules(rule_file, set(sig_by_name), valid_categories, aliases, rules)
    results = []
    # keyed by hit ID, each value holding every HSP for that hit that passed
    # its signature's cutoff
    results_by_id: Dict[str, List[HSP]] = {}

    runresults = run_hmmsearch(seeds_file, full_fasta, use_tempfile=True)
    for runresult in runresults:
        # drop anything after the first '.' in the accession
        acc = runresult.accession.split('.')[0]
        # Store result if it is above cut-off
        for hsp in runresult.hsps:
            # signatures are looked up by query ID first, then by accession
            if hsp.query_id in sig_by_name:
                sig = sig_by_name[hsp.query_id]
            elif acc in sig_by_name:
                sig = sig_by_name[acc]
            else:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (
                                                    hsp.query_id, acc))
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id, filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # annotate everything in detected protoclusters
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_protoclusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
        """ Builds one domain per remaining HSP recorded under the CDS's name """
        return [SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                       num_seeds_per_hmm[hsp.query_id], tool)
                for hsp in results_by_id.get(cds.get_name(), [])]

    cds_results_by_cluster = {}
    cdses_with_annotations = set()
    for cluster in clusters:
        cds_results = []
        for cds in record.get_cds_features_within_location(cluster.location):
            domains = get_domains_for_cds(cds)
            if domains:
                cds_results.append(CDSResults(cds, domains, cds_domains_by_cluster.get(cds.get_name(), {})))
                cdses_with_annotations.add(cds)
        cds_results_by_cluster[cluster] = cds_results

    # add detected profile annotations for any existing subregions, if enabled
    cds_results_outside_clusters = []
    if annotate_existing_subregions:
        for subregion in record.get_subregions():
            for cds in subregion.cds_children:
                # skip any CDS already annotated via a protocluster or an
                # earlier subregion, so it is only reported once
                if cds in cdses_with_annotations:
                    continue
                domains = get_domains_for_cds(cds)
                if domains:
                    cds_results_outside_clusters.append(CDSResults(cds, domains, {}))
                    cdses_with_annotations.add(cds)

    return RuleDetectionResults(cds_results_by_cluster, tool, cds_results_outside_clusters)
コード例 #6
0
def detect_borders_and_signatures(record: Record, signature_file: str,
                                  seeds_file: str, rules_file: str,
                                  filter_file: str,
                                  tool: str) -> RuleDetectionResults:
    """ Searches the record's CDS features against the HMM signatures, applies
        the cluster detection rules to the resulting hits and adds a cluster
        border to the record for every cluster found.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rules_file: the file containing all the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. clusterfinder, rule_based_clusters)

        Returns:
            None if no FASTA could be generated from the record (no CDS features),
            otherwise a RuleDetectionResults built from the tool name and, for
            each cluster, the results of CDS features within the cluster's
            location widened by its extent on both sides

        Raises:
            ValueError: if a hit matches no known signature by either its query
                        ID or its accession
    """
    full_fasta = fasta.get_fasta_from_record(record)
    # without any CDS features there is nothing to search
    if not full_fasta:
        return None

    signatures = {sig.name: sig for sig in get_signature_profiles(signature_file)}
    rules = create_rules(rules_file, set(signatures))
    hits = []
    # NOTE(review): values stored here are lists of HSPs, the type comment looks too narrow
    hits_by_id = {}  # type: Dict[str, HSP]

    for run in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        # drop anything after the first '.' in the accession
        accession = run.accession.split('.')[0]
        for hsp in run.hsps:
            # signatures are looked up by query ID first, then by accession
            if hsp.query_id in signatures:
                sig = signatures[hsp.query_id]
            elif accession in signatures:
                sig = signatures[accession]
            else:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (hsp.query_id, accession))
            # only hits scoring strictly above the signature's cutoff are kept
            if not hsp.bitscore > sig.cutoff:
                continue
            hits.append(hsp)
            hits_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # compare scores of different models (for PKS systems)
    hits, hits_by_id = filter_results(hits, hits_by_id, filter_file, set(signatures))

    # reduce multiple hits of the same model within one gene
    hits, hits_by_id = filter_result_multiple(hits, hits_by_id)

    # let the rules decide which genes form clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, hits_by_id, rules)

    # number of sequences each pHMM was built from
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_clusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    cds_results_by_cluster = {}
    for cluster in clusters:
        # the border is saved to the record as a side effect
        record.add_cluster_border(cluster)
        # widen the search area by the cluster's extent on both sides
        extended = FeatureLocation(cluster.location.start - cluster.extent,
                                   cluster.location.end + cluster.extent)
        cds_results = []
        for cds in record.get_cds_features_within_location(extended):
            domains = [SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                              num_seeds_per_hmm[hsp.query_id], tool)
                       for hsp in hits_by_id.get(cds.get_name(), [])]
            # a CDS without any remaining hits is not reported
            if not domains:
                continue
            cds_results.append(CDSResults(cds, domains,
                                          cds_domains_by_cluster.get(cds.get_name(), {})))
        cds_results_by_cluster[cluster] = cds_results

    return RuleDetectionResults(cds_results_by_cluster, tool)