def test_with_secmet(self):
    """ Checks that sec_met domains set on the CDS are written out as
        "sec_met_domain" qualifiers and are rebuilt unchanged by
        CDSFeature.from_biopython().
    """
    expected = [
        SecMetQualifier.Domain("testA", 0.1, 1.1, 3, "test"),
        SecMetQualifier.Domain("testB", 5.1, 3.9, 5, "dummy"),
    ]
    self.cds.sec_met = SecMetQualifier(expected)

    bio = self.convert()
    qualifiers = bio.qualifiers
    # again, detecting leftover legacy versions
    assert "sec_met" not in qualifiers
    serialised = qualifiers["sec_met_domain"]
    assert len(serialised) == 2
    assert serialised == [str(domain) for domain in expected]

    # and the reverse direction must give back equal domains
    regenerated = CDSFeature.from_biopython(bio)
    assert regenerated.sec_met
    rebuilt = regenerated.sec_met.domains
    assert len(rebuilt) == len(expected)
    assert rebuilt == expected
def annotate_orfs(cds_features: List[secmet.CDSFeature], hmm_results: Dict[str, List[HSP]]) -> None:
    """ Annotates newly found ORFs with sactipeptide domain information.
        This is only relevant for CDS features that did not exist during
        the cluster detection stage of antiSMASH.

        Arguments:
            cds_features: the CDS features to annotate, a feature is only
                          changed if it has at least one hit
            hmm_results: a dictionary mapping hit id (compared to the name of
                         each CDS) to a list of HSPs for that hit

        Returns:
            None
    """
    domains_by_feature: Dict[str, List[SecMetQualifier.Domain]] = defaultdict(list)
    for hit_id, hsps in hmm_results.items():
        # the fourth Domain argument is always 0 here and the tool name
        # is always "sactipeptides"
        domains_by_feature[hit_id].extend(
            SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore, 0, "sactipeptides")
            for hsp in hsps
        )

    for cds in cds_features:
        found = domains_by_feature.get(cds.get_name())
        if found:
            cds.sec_met = SecMetQualifier(found)
def __init__(self, location: Location, translation: str, locus_tag: str = None,
             protein_id: str = None, product: str = "", gene: str = None,
             translation_table: int = 1) -> None:
    """ Constructs a CDS feature.

        Arguments:
            location: the location of the feature, checked by _verify_location()
            translation: the translation of the CDS, stored as str(translation)
            locus_tag: an optional identifier, passed through _sanitise_id_value()
            protein_id: an optional identifier, passed through _sanitise_id_value()
            product: the product of the CDS, must be a string
            gene: an optional identifier, passed through _sanitise_id_value()
            translation_table: the translation table, stored as an int
                               in the transl_table attribute

        Raises:
            ValueError: if none of gene, protein_id, and locus_tag is provided
            TypeError: if product is not a string
    """
    super().__init__(location, feature_type="CDS")
    _verify_location(location)

    # mandatory
    self._gene_functions = GeneFunctionAnnotations()

    # at least one identifier has to be usable
    if not any((protein_id, locus_tag, gene)):
        raise ValueError("CDSFeature requires at least one of: gene, protein_id, locus_tag")

    # semi-optional
    self.protein_id = _sanitise_id_value(protein_id)
    self.locus_tag = _sanitise_id_value(locus_tag)
    self.gene = _sanitise_id_value(gene)
    self.translation = str(translation)

    # optional
    if isinstance(product, str):
        self.product = product
    else:
        raise TypeError("product must be a string, not %s" % type(product))
    self.transl_table = int(translation_table)
    self._sec_met = SecMetQualifier()
    self._nrps_pks = NRPSPKSQualifier(self.location.strand)

    self.motifs = []  # type: List[features.CDSMotif]

    # runtime-only data
    self.region = None  # type: Optional[features.Region]
    self.unique_id = None  # type: Optional[str] # set only when added to a record
def from_biopython(bio_feature: SeqFeature, feature: "CDSFeature" = None,  # type: ignore
                   leftovers: Optional[Dict] = None, record: Any = None) -> "CDSFeature":
    """ Builds a CDSFeature from a biopython SeqFeature.

        Arguments:
            bio_feature: the SeqFeature to convert
            feature: not read, a new CDSFeature is always constructed
            leftovers: the qualifiers still to be consumed, built with
                       Feature.make_qualifiers_copy() if not provided;
                       qualifiers used here are popped from it
            record: an optional record, if provided its transl_table is the
                    default translation table and it is passed on to
                    _ensure_valid_translation()

        Returns:
            the newly constructed CDSFeature

        Raises:
            SecmetInvalidInputError: if the location fails verification or
                                     _ensure_valid_translation() raises a ValueError
    """
    if leftovers is None:
        leftovers = Feature.make_qualifiers_copy(bio_feature)

    # grab mandatory qualifiers
    # an explicit qualifier beats the record's table, which beats the default
    transl_table = record.transl_table if record else 1
    if "transl_table" in leftovers:
        transl_table = int(leftovers.pop("transl_table")[0])

    # semi-optional qualifiers
    protein_id, locus_tag, gene = (
        leftovers.pop(key, [None])[0] for key in ("protein_id", "locus_tag", "gene")
    )
    if not any((gene, protein_id, locus_tag)):
        # no identifiers at all, so construct one from the coordinates
        is_pseudo = "pseudo" in leftovers or "pseudogene" in leftovers
        template = "pseudo%s_%s" if is_pseudo else "cds%s_%s"
        gene = template % (bio_feature.location.start, bio_feature.location.end)

    # the identifier used in any error messages
    name = locus_tag or protein_id or gene

    try:
        _verify_location(bio_feature.location)
    except Exception as err:
        raise SecmetInvalidInputError("invalid location for %s: %s" % (name, str(err))) from err

    existing_translation = leftovers.pop("translation", [""])[0]
    try:
        translation = _ensure_valid_translation(existing_translation, bio_feature.location,
                                                transl_table, record)
    except ValueError as err:
        raise SecmetInvalidInputError(str(err) + ": %s" % name) from err

    feature = CDSFeature(bio_feature.location, translation, gene=gene,
                         locus_tag=locus_tag, protein_id=protein_id,
                         translation_table=transl_table)

    # grab optional qualifiers
    feature.product = leftovers.pop("product", [""])[0]

    sec_met_domains = leftovers.pop("sec_met_domain", None)
    if sec_met_domains:
        feature.sec_met = SecMetQualifier.from_biopython(sec_met_domains)

    functions = leftovers.pop("gene_functions", [])
    if functions:
        feature.gene_functions.add_from_qualifier(functions)

    feature.nrps_pks.add_from_qualifier(leftovers.pop("NRPS_PKS", []))

    # grab parent optional qualifiers
    super(CDSFeature, feature).from_biopython(bio_feature, feature=feature, leftovers=leftovers)

    return feature
def annotate(self, tool: str) -> None:
    """ Annotates a CDSFeature with the results gathered.

        Domains named in the definition domains are added as CORE gene
        functions. Every other domain of the CDS is added as an ADDITIONAL
        gene function, unless its name was already in the sec_met qualifier
        before this call.

        Arguments:
            tool: the name of the tool to use for CORE gene functions

        Returns:
            None
    """
    cds = self.cds
    # names of domains that must not be reported as ADDITIONAL
    handled = set()

    if cds.sec_met:
        handled.update(cds.sec_met.domain_ids)
        cds.sec_met.add_domains(self.domains)
    else:
        cds.sec_met = SecMetQualifier(self.domains)

    for cluster_type, core_names in self.definition_domains.items():
        handled.update(core_names)
        for core_name in core_names:
            cds.gene_functions.add(GeneFunction.CORE, tool, core_name, cluster_type)

    # and add all detected domains as ADDITIONAL if not CORE
    for domain in cds.sec_met.domains:
        if domain.name not in handled:
            cds.gene_functions.add(GeneFunction.ADDITIONAL, domain.tool, domain.name)
def strip_record(record: Record) -> None:
    """ Discard antismash specific features and feature qualifiers

        Arguments:
            record: the record to strip, modified in place

        Returns:
            None
    """
    logging.debug("Stripping antiSMASH features and annotations from record: %s", record.id)

    record.clear_clusters()
    record.clear_superclusters()
    record.clear_subregions()
    record.clear_regions()
    record.clear_antismash_domains()
    record.clear_pfam_domains()

    # clean up antiSMASH-created CDSMotifs, but leave the rest
    # the existing motifs have to be copied before the record drops them all
    previous_motifs = list(record.get_cds_motifs())
    record.clear_cds_motifs()
    for motif in previous_motifs:
        if motif.created_by_antismash:
            continue
        record.add_cds_motif(motif)

    # clean up antiSMASH annotations in CDS features
    for cds in record.get_cds_features():
        cds.sec_met = SecMetQualifier()
        cds.gene_functions.clear()
def test_with_no_secmet(self):
    """ Checks that distance_to_pfam() gives -1 when the only CDS added has
        no profiles and a SecMetQualifier constructed without arguments.
    """
    feature = self.create_cds(55000, 60000, profiles=[])
    feature.sec_met = SecMetQualifier()
    self.record.add_cds_feature(feature)

    distance = utils.distance_to_pfam(self.record, self.query, ["test"])
    assert distance == -1
def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
    """ Builds a domain for each HSP stored under the name of the given CDS.

        Arguments:
            cds: the CDS feature to build domains for

        Returns:
            a list of domains, one for each HSP, which will be empty if
            there are no results for the CDS
    """
    hsps = results_by_id.get(cds.get_name(), [])
    return [
        SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                               num_seeds_per_hmm[hsp.query_id], tool)
        for hsp in hsps
    ]
def strip_antismash_annotations(self) -> None:
    """ Remove all antiSMASH-specific annotations from the feature.

        The sec_met and NRPS/PKS qualifiers are replaced with newly
        constructed ones and the gene functions are cleared.
    """
    # secondary metabolite domains
    self.sec_met = SecMetQualifier()

    # gene functions
    self.gene_functions.clear()

    # NRPS/PKS annotations, the new qualifier is given the feature's strand
    strand = self.location.strand
    self.nrps_pks = NRPSPKSQualifier(strand)
def from_biopython(bio_feature: SeqFeature, feature: "CDSFeature" = None,  # type: ignore
                   leftovers: Optional[Dict] = None, record: Any = None) -> "CDSFeature":
    """ Builds a CDSFeature from a biopython SeqFeature, regenerating the
        translation from the record when it is absent or contains
        invalid characters.

        Arguments:
            bio_feature: the SeqFeature to convert
            feature: not read, a new CDSFeature is always constructed
            leftovers: the qualifiers still to be consumed, built with
                       Feature.make_qualifiers_copy() if not provided;
                       qualifiers used here are popped from it
            record: an optional record, if provided its transl_table is the
                    default translation table and it is used to generate
                    a missing translation

        Returns:
            the newly constructed CDSFeature

        Raises:
            SecmetInvalidInputError: if the location fails verification, the
                                     translation is too long for the location,
                                     or a translation is required but cannot
                                     be generated
    """
    if leftovers is None:
        leftovers = Feature.make_qualifiers_copy(bio_feature)

    # grab mandatory qualifiers
    # an explicit qualifier beats the record's table, which beats the default
    transl_table = record.transl_table if record else 1
    if "transl_table" in leftovers:
        transl_table = int(leftovers.pop("transl_table")[0])
    translation = leftovers.pop("translation", [""])[0]

    # semi-optional qualifiers
    protein_id, locus_tag, gene = (
        leftovers.pop(key, [None])[0] for key in ("protein_id", "locus_tag", "gene")
    )
    if not any((gene, protein_id, locus_tag)):
        # no identifiers at all, so construct one from the coordinates
        is_pseudo = "pseudo" in leftovers or "pseudogene" in leftovers
        template = "pseudo%s_%s" if is_pseudo else "cds%s_%s"
        gene = template % (bio_feature.location.start, bio_feature.location.end)

    # errors prefer the gene name, while the warning prefers the locus tag
    error_name = gene or protein_id or locus_tag
    warning_name = locus_tag or protein_id or gene

    try:
        _verify_location(bio_feature.location)
    except Exception as err:
        message = "invalid location for %s: %s" % (error_name, str(err))
        raise SecmetInvalidInputError(message) from err

    # ensure translation is valid if it exists
    if translation:
        invalid = set(translation) - _VALID_TRANSLATION_CHARS
        if invalid:
            logging.warning("Regenerating translation for CDS %s (at %s) containing invalid characters: %s",
                            warning_name, bio_feature.location, invalid)
            # emptied so that it is regenerated below
            translation = ""

    # ensure that the translation fits
    if not _is_valid_translation_length(translation, bio_feature.location):
        raise SecmetInvalidInputError("translation longer than location allows: %s > %s" % (
            len(translation) * 3, len(bio_feature.location)))

    # finally, generate the translation if it doesn't exist
    if not translation:
        if not record:
            raise SecmetInvalidInputError("no translation in CDS and no record to generate it with")
        if bio_feature.location.end > len(record.seq):
            raise SecmetInvalidInputError("feature missing translation and sequence too short: %s" % (
                error_name))
        translation = record.get_aa_translation_from_location(bio_feature.location, transl_table)

    assert _is_valid_translation_length(translation, bio_feature.location)

    feature = CDSFeature(bio_feature.location, translation, gene=gene,
                         locus_tag=locus_tag, protein_id=protein_id,
                         translation_table=transl_table)

    # grab optional qualifiers
    feature.product = leftovers.pop("product", [""])[0]

    sec_met_domains = leftovers.pop("sec_met_domain", None)
    if sec_met_domains:
        feature.sec_met = SecMetQualifier.from_biopython(sec_met_domains)

    functions = leftovers.pop("gene_functions", [])
    if functions:
        feature.gene_functions.add_from_qualifier(functions)

    # grab parent optional qualifiers
    super(CDSFeature, feature).from_biopython(bio_feature, feature=feature, leftovers=leftovers)

    return feature
def detect_protoclusters_and_signatures(record: Record, signature_file: str, seeds_file: str,
                                        rule_files: List[str], filter_file: str,
                                        tool: str) -> RuleDetectionResults:
    """ Compares all CDS features in a record with HMM signatures and generates
        Protocluster features based on those hits and the current protocluster
        detection rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                            with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rule_files: the files containing the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs
                  (e.g. clusterfinder, rule_based_clusters)

        Returns:
            a RuleDetectionResults instance built from the tool name and a
            dictionary mapping each protocluster found to its CDSResults,
            the dictionary is empty if the record gave no FASTA

        Raises:
            ValueError: if no rule files are provided or if a hit has no
                        matching signature
    """
    if not rule_files:
        raise ValueError("rules must be provided")

    full_fasta = fasta.get_fasta_from_record(record)
    # if there's no CDS features, don't try to do anything
    if not full_fasta:
        return RuleDetectionResults({}, tool)

    sig_by_name = {}
    for profile in get_signature_profiles(signature_file):
        sig_by_name[profile.name] = profile

    # each rule file extends the rules created from the previous files
    rules = []  # type: List[rule_parser.DetectionRule]
    for rule_file in rule_files:
        rules = create_rules(rule_file, set(sig_by_name), rules)

    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]

    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        acc = runresult.accession.split('.')[0]
        # Store result if it is above cut-off
        for hsp in runresult.hsps:
            # prefer the query id, falling back to the unversioned accession
            if hsp.query_id in sig_by_name:
                sig = sig_by_name[hsp.query_id]
            elif acc in sig_by_name:
                sig = sig_by_name[acc]
            else:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (hsp.query_id, acc))
            if not hsp.bitscore > sig.cutoff:
                continue
            results.append(hsp)
            results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id, filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # Save final results to record
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_protoclusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    cds_results_by_cluster = {}
    for cluster in clusters:
        cds_results = []
        for cds in record.get_cds_features_within_location(cluster.location):
            cds_name = cds.get_name()
            domains = [
                SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                       num_seeds_per_hmm[hsp.query_id], tool)
                for hsp in results_by_id.get(cds_name, [])
            ]
            # a CDS without any hits gets no results entry
            if domains:
                cds_results.append(CDSResults(cds, domains, cds_domains_by_cluster.get(cds_name, {})))
        cds_results_by_cluster[cluster] = cds_results

    return RuleDetectionResults(cds_results_by_cluster, tool)