Ejemplo n.º 1
0
    def test_with_secmet(self):
        """ Checks that secondary metabolite domains attached to the CDS are
            written out as "sec_met_domain" qualifiers on conversion and are
            restored unchanged by CDSFeature.from_biopython().
        """
        expected = [
            SecMetQualifier.Domain("testA", 0.1, 1.1, 3, "test"),
            SecMetQualifier.Domain("testB", 5.1, 3.9, 5, "dummy"),
        ]
        self.cds.sec_met = SecMetQualifier(expected)

        bio = self.convert()
        # the legacy qualifier name must not show up in the converted feature
        assert "sec_met" not in bio.qualifiers
        serialised = bio.qualifiers["sec_met_domain"]
        assert len(serialised) == 2
        # each domain is stored as its string form, in the original order
        assert serialised == [str(domain) for domain in expected]

        rebuilt = CDSFeature.from_biopython(bio)
        assert rebuilt.sec_met
        assert len(rebuilt.sec_met.domains) == len(expected)
        assert rebuilt.sec_met.domains == expected
def annotate_orfs(cds_features: List[secmet.CDSFeature], hmm_results: Dict[str, List[HSP]]) -> None:
    """ Annotates newly found ORFs with sactipeptide domain information.
        This is only relevant for CDS features that did not exist during
        the cluster detection stage of antiSMASH.

        Arguments:
            cds_features: the CDS features to annotate
            hmm_results: a dictionary mapping hit identifier to the HMM results
                         for that identifier, matched against each CDS by name

        Returns:
            None, every CDS with at least one result has its sec_met attribute
            replaced by a new SecMetQualifier
    """
    # one domain per HMM result, grouped under the identifier the result hit
    hits_by_name: Dict[str, List[SecMetQualifier.Domain]] = {}
    for name, hsps in hmm_results.items():
        hits_by_name[name] = [
            SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore, 0, "sactipeptides")
            for hsp in hsps
        ]

    for cds in cds_features:
        hits = hits_by_name.get(cds.get_name())
        # features without any hits keep whatever annotation they already had
        if not hits:
            continue
        cds.sec_met = SecMetQualifier(hits)
 def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
     """ Builds one domain per result stored under the name of the given CDS.

         Arguments:
             cds: the CDS feature to build domains for

         Returns:
             a list of SecMetQualifier.Domain instances, empty if no results
             are stored under the name of the CDS
     """
     # results_by_id, num_seeds_per_hmm and tool come from the enclosing scope
     return [
         SecMetQualifier.Domain(hit.query_id, hit.evalue, hit.bitscore,
                                num_seeds_per_hmm[hit.query_id], tool)
         for hit in results_by_id.get(cds.get_name(), [])
     ]
Ejemplo n.º 4
0
def detect_protoclusters_and_signatures(record: Record, signature_file: str,
                                        seeds_file: str, rule_files: List[str],
                                        filter_file: str,
                                        tool: str) -> RuleDetectionResults:
    """ Searches all CDS features of a record with the HMM signatures and
        builds Protocluster features from those hits according to the current
        protocluster detection rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rule_files: the files containing the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. clusterfinder, rule_based_clusters)

        Returns:
            a RuleDetectionResults instance built from the tool name and a
            dictionary mapping each protocluster found to the CDSResults of the
            CDS features within its location that have at least one domain;
            the dictionary is empty if no FASTA could be generated for the record

        Raises:
            ValueError: if no rule files are given, or if neither the query id
                        of a hit nor the part of its result's accession before
                        the first '.' names a known signature
    """
    if not rule_files:
        raise ValueError("rules must be provided")

    full_fasta = fasta.get_fasta_from_record(record)
    if not full_fasta:
        # no CDS features means there is nothing to search
        return RuleDetectionResults({}, tool)

    sig_by_name = {}
    for profile in get_signature_profiles(signature_file):
        sig_by_name[profile.name] = profile

    rules = []  # type: List[rule_parser.DetectionRule]
    for rule_file in rule_files:
        # the rules gathered so far are handed on when parsing the next file
        rules = create_rules(rule_file, set(sig_by_name), rules)

    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]

    for search_result in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        accession = search_result.accession.split('.', 1)[0]
        for hsp in search_result.hsps:
            # prefer a signature named after the query, then one named after the accession
            signature = sig_by_name.get(hsp.query_id)
            if signature is None:
                signature = sig_by_name.get(accession)
            if signature is None:
                raise ValueError(
                    'Failed to find signature for ID %s / ACC %s' %
                    (hsp.query_id, accession))
            # only hits scoring strictly above the cutoff of the signature are kept
            if not hsp.bitscore > signature.cutoff:
                continue
            results.append(hsp)
            results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id,
                                            filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(
        record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_protoclusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    # Gather the per-CDS results of every protocluster for the final result
    cds_results_by_cluster = {}
    for cluster in clusters:
        cluster_cds_results = []
        for cds in record.get_cds_features_within_location(cluster.location):
            name = cds.get_name()
            domains = [
                SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                       num_seeds_per_hmm[hsp.query_id], tool)
                for hsp in results_by_id.get(name, [])
            ]
            # CDS features without any remaining hits are left out entirely
            if not domains:
                continue
            cluster_cds_results.append(
                CDSResults(cds, domains, cds_domains_by_cluster.get(name, {})))
        cds_results_by_cluster[cluster] = cluster_cds_results

    return RuleDetectionResults(cds_results_by_cluster, tool)