예제 #1
0
    def test_with_secmet(self):
        """ Checks that sec_met domains set on the CDS survive a round trip
            through self.convert() and CDSFeature.from_biopython()
        """
        expected = [
            SecMetQualifier.Domain("testA", 0.1, 1.1, 3, "test"),
            SecMetQualifier.Domain("testB", 5.1, 3.9, 5, "dummy"),
        ]
        self.cds.sec_met = SecMetQualifier(expected)
        converted = self.convert()

        # the legacy qualifier name must not show up in the converted feature
        assert "sec_met" not in converted.qualifiers
        serialised = converted.qualifiers["sec_met_domain"]
        assert len(serialised) == 2
        assert serialised == [str(domain) for domain in expected]

        # and the conversion back must reproduce the same domains
        rebuilt = CDSFeature.from_biopython(converted)
        assert rebuilt.sec_met
        assert len(rebuilt.sec_met.domains) == len(expected)
        assert rebuilt.sec_met.domains == expected
def annotate_orfs(cds_features: List[secmet.CDSFeature], hmm_results: Dict[str, List[HSP]]) -> None:
    """ Annotates newly found ORFs with sactipeptide domain information.
        This is only relevant for CDS features that did not exist during
        the cluster detection stage of antiSMASH.

        Arguments:
            cds_features: the CDS features to annotate, each is modified in
                          place if it has any hits
            hmm_results: a dictionary mapping CDS name to the HSPs found for it

        Returns:
            None
    """
    hits_per_cds: Dict[str, List[SecMetQualifier.Domain]] = defaultdict(list)
    for cds_name, hsps in hmm_results.items():
        hits_per_cds[cds_name].extend(
            SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore, 0, "sactipeptides")
            for hsp in hsps
        )

    for feature in cds_features:
        found = hits_per_cds[feature.get_name()]
        # features without hits keep whatever qualifier they already have
        if not found:
            continue
        feature.sec_met = SecMetQualifier(found)
예제 #3
0
    def __init__(self, location: Location, translation: str, locus_tag: str = None,
                 protein_id: str = None, product: str = "", gene: str = None,
                 translation_table: int = 1) -> None:
        """ Initialises a CDS feature.

            Arguments:
                location: the location of the feature, passed on to the parent
                          initialiser and checked with _verify_location
                translation: the translation of the CDS, stored as a string
                locus_tag: an optional locus tag
                protein_id: an optional protein id
                product: the product of the CDS, must be a string
                gene: an optional gene name
                translation_table: the translation table, stored as an int

            Raises:
                ValueError: if none of gene, protein_id, or locus_tag is given
                TypeError: if product is not a string
        """
        super().__init__(location, feature_type="CDS")
        # NOTE(review): presumably raises for locations unsuitable for a CDS - confirm
        _verify_location(location)
        # mandatory
        self._gene_functions = GeneFunctionAnnotations()

        # at least one identifier is needed to be able to refer to the feature
        if not (protein_id or locus_tag or gene):
            raise ValueError("CDSFeature requires at least one of: gene, protein_id, locus_tag")
        # semi-optional
        self.protein_id = _sanitise_id_value(protein_id)
        self.locus_tag = _sanitise_id_value(locus_tag)
        self.gene = _sanitise_id_value(gene)
        self.translation = str(translation)

        # optional
        if not isinstance(product, str):
            raise TypeError("product must be a string, not %s" % type(product))
        self.product = product
        self.transl_table = int(translation_table)
        self._sec_met = SecMetQualifier()
        # the qualifier is built with the strand of this feature's location
        self._nrps_pks = NRPSPKSQualifier(self.location.strand)

        self.motifs = []  # type: List[features.CDSMotif]

        # runtime-only data
        self.region = None  # type: Optional[features.Region]
        self.unique_id = None  # type: Optional[str] # set only when added to a record
예제 #4
0
    def from_biopython(bio_feature: SeqFeature, feature: "CDSFeature" = None,  # type: ignore
                       leftovers: Optional[Dict] = None, record: Any = None) -> "CDSFeature":
        """ Builds a CDSFeature from a biopython SeqFeature.

            Arguments:
                bio_feature: the biopython feature to convert
                feature: unused, a new CDSFeature is always constructed
                leftovers: the qualifiers not yet consumed, if not provided
                           a copy is made from the given biopython feature
                record: an optional record, supplies the default translation
                        table and is passed on for translation validation

            Returns:
                the newly constructed CDSFeature

            Raises:
                SecmetInvalidInputError: if the location fails verification or
                                         the translation validation raises a ValueError
        """
        if leftovers is None:
            leftovers = Feature.make_qualifiers_copy(bio_feature)
        location = bio_feature.location

        # the translation table of the feature itself overrides that of the record
        table = record.transl_table if record else 1
        if "transl_table" in leftovers:
            table = int(leftovers.pop("transl_table")[0])

        # identifiers, at least one of which has to exist
        protein_id = leftovers.pop("protein_id", [None])[0]
        locus_tag = leftovers.pop("locus_tag", [None])[0]
        gene = leftovers.pop("gene", [None])[0]
        if not any((gene, protein_id, locus_tag)):
            # no identifier at all, so construct one from the coordinates
            is_pseudo = "pseudo" in leftovers or "pseudogene" in leftovers
            template = "pseudo%s_%s" if is_pseudo else "cds%s_%s"
            gene = template % (location.start, location.end)
        name = locus_tag or protein_id or gene

        try:
            _verify_location(location)
        except Exception as err:
            message = "invalid location for %s: %s" % (name, str(err))
            raise SecmetInvalidInputError(message) from err

        raw_translation = leftovers.pop("translation", [""])[0]
        try:
            translation = _ensure_valid_translation(raw_translation, location, table, record)
        except ValueError as err:
            raise SecmetInvalidInputError(str(err) + ": %s" % name) from err

        cds = CDSFeature(location, translation, gene=gene,
                         locus_tag=locus_tag, protein_id=protein_id,
                         translation_table=table)

        # the remaining qualifiers specific to CDS features are all optional
        cds.product = leftovers.pop("product", [""])[0]
        domains = leftovers.pop("sec_met_domain", None)
        if domains:
            cds.sec_met = SecMetQualifier.from_biopython(domains)
        functions = leftovers.pop("gene_functions", [])
        if functions:
            cds.gene_functions.add_from_qualifier(functions)
        cds.nrps_pks.add_from_qualifier(leftovers.pop("NRPS_PKS", []))

        # whatever is left is handled by the parent class
        super(CDSFeature, cds).from_biopython(bio_feature, feature=cds, leftovers=leftovers)

        return cds
    def annotate(self, tool: str) -> None:
        """ Annotates a CDSFeature with the results gathered

            Arguments:
                tool: the name of the tool to record for each CORE gene function

            Returns:
                None
        """
        cds = self.cds
        handled = set()
        if cds.sec_met:
            # domains present before this call are never added as ADDITIONAL,
            # so their ids have to be gathered before the new domains are merged
            handled.update(cds.sec_met.domain_ids)
            cds.sec_met.add_domains(self.domains)
        else:
            cds.sec_met = SecMetQualifier(self.domains)

        # every domain that was part of a cluster definition is a CORE function
        for cluster_type, core_domains in self.definition_domains.items():
            handled.update(core_domains)
            for core_domain in core_domains:
                cds.gene_functions.add(GeneFunction.CORE, tool, core_domain, cluster_type)

        # and add all detected domains as ADDITIONAL if not CORE
        for detected in cds.sec_met.domains:
            if detected.name not in handled:
                cds.gene_functions.add(GeneFunction.ADDITIONAL, detected.tool,
                                       detected.name)
예제 #6
0
def strip_record(record: Record) -> None:
    """ Discard antismash specific features and feature qualifiers

        Arguments:
            record: the record to strip, modified in place

        Returns:
            None
    """
    logging.debug("Stripping antiSMASH features and annotations from record: %s", record.id)

    # feature types that are removed entirely
    record.clear_clusters()
    record.clear_superclusters()
    record.clear_subregions()
    record.clear_regions()
    record.clear_antismash_domains()
    record.clear_pfam_domains()

    # CDSMotifs are only removed if antiSMASH created them, so the others
    # are set aside and added back after clearing
    foreign_motifs = [motif for motif in record.get_cds_motifs()
                      if not motif.created_by_antismash]
    record.clear_cds_motifs()
    for foreign in foreign_motifs:
        record.add_cds_motif(foreign)

    # CDS features themselves are kept, only their antiSMASH annotations are reset
    for cds in record.get_cds_features():
        cds.sec_met = SecMetQualifier()
        cds.gene_functions.clear()
예제 #7
0
 def test_with_no_secmet(self):
     """ A CDS with a SecMetQualifier built without domains gives a distance of -1 """
     feature = self.create_cds(55000, 60000, profiles=[])
     feature.sec_met = SecMetQualifier()
     self.record.add_cds_feature(feature)
     distance = utils.distance_to_pfam(self.record, self.query, ["test"])
     assert distance == -1
 def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
     """ Builds one SecMetQualifier.Domain for each HSP recorded against the
         name of the given CDS, or an empty list if there are none
     """
     return [
         SecMetQualifier.Domain(hit.query_id, hit.evalue, hit.bitscore,
                                num_seeds_per_hmm[hit.query_id], tool)
         for hit in results_by_id.get(cds.get_name(), [])
     ]
예제 #9
0
 def strip_antismash_annotations(self) -> None:
     """ Remove all antiSMASH-specific annotations from the feature

         Replaces the sec_met qualifier with a SecMetQualifier constructed
         without arguments, clears the gene functions, and replaces the
         nrps_pks qualifier with a new NRPSPKSQualifier built from the strand
         of the feature's location.
     """
     self.sec_met = SecMetQualifier()
     self.gene_functions.clear()
     self.nrps_pks = NRPSPKSQualifier(self.location.strand)
예제 #10
0
    def from_biopython(bio_feature: SeqFeature, feature: "CDSFeature" = None,  # type: ignore
                       leftovers: Optional[Dict] = None, record: Any = None) -> "CDSFeature":
        """ Builds a CDSFeature from a biopython SeqFeature.

            Arguments:
                bio_feature: the biopython feature to convert
                feature: unused, a new CDSFeature is always constructed
                leftovers: the qualifiers not yet consumed, if not provided
                           a copy is made from the given biopython feature
                record: an optional record, supplies the default translation
                        table and is required to generate a missing translation

            Returns:
                the newly constructed CDSFeature

            Raises:
                SecmetInvalidInputError: if the location fails verification,
                                         if the translation is too long for the location,
                                         or if a translation is missing and cannot be
                                         generated from the record
        """
        if leftovers is None:
            leftovers = Feature.make_qualifiers_copy(bio_feature)
        # grab mandatory qualifiers
        transl_table = 1
        if record:
            transl_table = record.transl_table
        # a translation table on the feature itself overrides that of the record
        if "transl_table" in leftovers:
            transl_table = int(leftovers.pop("transl_table")[0])
        translation = leftovers.pop("translation", [""])[0]

        # semi-optional qualifiers
        protein_id = leftovers.pop("protein_id", [None])[0]
        locus_tag = leftovers.pop("locus_tag", [None])[0]
        gene = leftovers.pop("gene", [None])[0]
        if not (gene or protein_id or locus_tag):
            # no identifier at all, so construct one from the coordinates
            if "pseudo" in leftovers or "pseudogene" in leftovers:
                gene = "pseudo%s_%s"
            else:
                gene = "cds%s_%s"
            gene = gene % (bio_feature.location.start, bio_feature.location.end)
        # a single identifier for all diagnostics, so that every message
        # about this feature refers to it by the same name
        name = locus_tag or protein_id or gene

        try:
            _verify_location(bio_feature.location)
        except Exception as err:
            message = "invalid location for %s: %s" % (name, str(err))
            raise SecmetInvalidInputError(message) from err

        # ensure translation is valid if it exists
        if translation:
            invalid = set(translation) - _VALID_TRANSLATION_CHARS
            if invalid:
                logging.warning("Regenerating translation for CDS %s (at %s) containing invalid characters: %s",
                                name, bio_feature.location, invalid)
                # discarding the translation triggers the regeneration below
                translation = ""
        # ensure that the translation fits
        if not _is_valid_translation_length(translation, bio_feature.location):
            raise SecmetInvalidInputError("translation longer than location allows: %s > %s" % (
                                len(translation) * 3, len(bio_feature.location)))
        # finally, generate the translation if it doesn't exist
        if not translation:
            if not record:
                raise SecmetInvalidInputError("no translation in CDS and no record to generate it with")
            if bio_feature.location.end > len(record.seq):
                raise SecmetInvalidInputError("feature missing translation and sequence too short: %s" % name)
            translation = record.get_aa_translation_from_location(bio_feature.location, transl_table)

        # sanity check of a generated translation, not a check of user input
        assert _is_valid_translation_length(translation, bio_feature.location)

        feature = CDSFeature(bio_feature.location, translation, gene=gene,
                             locus_tag=locus_tag, protein_id=protein_id,
                             translation_table=transl_table)

        # grab optional qualifiers
        feature.product = leftovers.pop("product", [""])[0]
        sec_met = leftovers.pop("sec_met_domain", None)
        if sec_met:
            feature.sec_met = SecMetQualifier.from_biopython(sec_met)
        gene_functions = leftovers.pop("gene_functions", [])
        if gene_functions:
            feature.gene_functions.add_from_qualifier(gene_functions)

        # grab parent optional qualifiers
        super(CDSFeature, feature).from_biopython(bio_feature, feature=feature, leftovers=leftovers)

        return feature
예제 #11
0
def detect_protoclusters_and_signatures(record: Record, signature_file: str,
                                        seeds_file: str, rule_files: List[str],
                                        filter_file: str,
                                        tool: str) -> RuleDetectionResults:
    """ Compares all CDS features in a record with HMM signatures and generates
        Protocluster features based on those hits and the current protocluster detection
        rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rule_files: the files containing the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. clusterfinder, rule_based_clusters)

        Returns:
            a RuleDetectionResults instance built from a dictionary mapping each
            protocluster to the CDSResults of the CDS features within it

        Raises:
            ValueError: if no rule files are given or if a hit cannot be
                        matched to a signature by either query id or accession
    """
    if not rule_files:
        raise ValueError("rules must be provided")
    full_fasta = fasta.get_fasta_from_record(record)
    # without any CDS features there's nothing to search
    if not full_fasta:
        return RuleDetectionResults({}, tool)

    sig_by_name = {}
    for signature in get_signature_profiles(signature_file):
        sig_by_name[signature.name] = signature

    # each rule file is parsed with the rules of the previous files available
    rules = []  # type: List[rule_parser.DetectionRule]
    for rule_file in rule_files:
        rules = create_rules(rule_file, set(sig_by_name), rules)

    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]
    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            # signatures are found by query id first, then by accession
            for key in (hsp.query_id, acc):
                if key in sig_by_name:
                    sig = sig_by_name[key]
                    break
            else:
                raise ValueError(
                    'Failed to find signature for ID %s / ACC %s' %
                    (hsp.query_id, acc))
            # only hits scoring above the cutoff of the signature are kept
            if not hsp.bitscore > sig.cutoff:
                continue
            results.append(hsp)
            results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id,
                                            filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(
        record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # Build the protoclusters themselves
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_protoclusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    # Gather the hits of each CDS, grouped by the protocluster containing it
    cds_results_by_cluster = {}
    for cluster in clusters:
        cluster_results = []
        for cds in record.get_cds_features_within_location(cluster.location):
            cds_name = cds.get_name()
            domains = [
                SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                       num_seeds_per_hmm[hsp.query_id], tool)
                for hsp in results_by_id.get(cds_name, [])
            ]
            # CDS features without any hits are not reported
            if not domains:
                continue
            cluster_results.append(
                CDSResults(cds, domains, cds_domains_by_cluster.get(cds_name, {})))
        cds_results_by_cluster[cluster] = cluster_results

    return RuleDetectionResults(cds_results_by_cluster, tool)