def annotate_domains(self, record: Record, cds: CDSFeature) -> None:
    """ Annotates a CDS with the domains and motifs held by this result.

        An AntismashDomain feature is added to the record for every domain and
        the same domain is registered on the NRPS/PKS qualifier of the CDS.
        CDSMotif features are then added for any motifs.
        Nothing is changed if there are no domain HMM hits.

        Arguments:
            record: the Record to add the new features to
            cds: the CDSFeature the domains and motifs belong to

        Returns:
            None
    """
    if not self.domain_hmms:
        return

    cds.nrps_pks.type = self.type

    # one AntismashDomain feature per domain hit
    features_by_hit = generate_domain_features(record, cds, self.domain_hmms)
    for hit, feature in features_by_hit.items():
        record.add_antismash_domain(feature)
        # keep the NRPS_PKS qualifier of the CDS in step with the record
        cds.nrps_pks.add_domain(hit, feature.get_name())

    # CDSMotif features are only relevant when motif hits exist
    if self.motif_hmms:
        motifs = generate_motif_features(record, cds, self.motif_hmms)
        for motif in motifs:
            record.add_cds_motif(motif)
        cds.motifs.extend(motifs)
def add_to_record(self, record: Record) -> None:
    """ Adds a ModuleFeature to the record for every module in the results.

        A module may have components on more than one CDS, in which case the
        domain features are looked up in the results of the CDS named by the
        component's locus. Each module is only added once.

        Arguments:
            record: the Record to add the module features to

        Returns:
            None
    """
    # modules spanning multiple CDS features would otherwise be added repeatedly
    seen = set()
    for cds, result in self.cds_results.items():
        for module in result.modules:
            if module in seen:
                continue
            seen.add(module)

            domains: List[AntismashDomain] = []
            for component in module:
                if component.locus == cds.get_name():
                    owner = result
                else:
                    # the component belongs to a different CDS
                    owner = self.cds_results[record.get_cds_by_name(component.locus)]
                domains.append(owner.domain_features[component.domain])

            if module.is_nrps():
                mod_type = ModuleFeature.types.NRPS
            elif module.is_pks():
                mod_type = ModuleFeature.types.PKS
            else:
                mod_type = ModuleFeature.types.UNKNOWN

            record.add_module(ModuleFeature(
                domains,
                mod_type,
                complete=module.is_complete(),
                starter=module.is_starter_module(),
                final=module.is_termination_module(),
                iterative=module.is_iterative(),
            ))
def test_cluster_numbering(self):
    """ Cluster numbers must follow sorted order, starting at one, regardless
        of the order in which clusters were added
    """
    record = Record(Seq("A"*1000))
    # deliberately added out of order
    for start, end in [(50, 100), (10, 40), (700, 1000), (0, 9)]:
        record.add_cluster(helpers.DummyCluster(start, end))

    ordered = sorted(record.get_clusters())
    for expected_number, cluster in enumerate(ordered, start=1):
        assert cluster.get_cluster_number() == expected_number
def annotate_domains(self, record: Record, cds: CDSFeature) -> None:
    """ Annotates a CDS with the domains and motifs held by this result.

        The generated domain features are stored on this instance, added to the
        record and registered on the NRPS/PKS qualifier of the CDS. Each
        "PKS_KS" domain consumes the next entry of `self.ks_subtypes`, all other
        domains are given an empty subtype. CDSMotif features are then added
        for any motifs.
        Nothing is changed if there are no domain HMM hits.

        Arguments:
            record: the Record to add the new features to
            cds: the CDSFeature the domains and motifs belong to

        Returns:
            None
    """
    if not self.domain_hmms:
        return

    cds.nrps_pks.type = self.type

    # keep the generated features, they are reused when building modules
    self.domain_features = generate_domain_features(cds, self.domain_hmms)

    remaining_subtypes = iter(self.ks_subtypes)
    for hit, feature in self.domain_features.items():
        # subtypes are consumed in order, one for each KS domain
        subtype = next(remaining_subtypes) if hit.hit_id == "PKS_KS" else ""
        record.add_antismash_domain(feature)
        # keep the NRPS_PKS qualifier of the CDS in step with the record
        cds.nrps_pks.add_domain(hit, feature.get_name(), subtype)

    # CDSMotif features are only relevant when motif hits exist
    if self.motif_hmms:
        motifs = generate_motif_features(cds, self.motif_hmms)
        for motif in motifs:
            record.add_cds_motif(motif)
        cds.motifs.extend(motifs)
def test_blank_records(self):
    """ No gene ontologies should be found for a record without Pfam domains
        or for a record with only the placeholder identifier PF00000
    """
    without_pfams = DummyRecord()

    with_dummy_pfam = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
    with_dummy_pfam.add_pfam_domain(DummyPFAMDomain(identifier="PF00000"))

    for record in (without_pfams, with_dummy_pfam):
        assert not pfam2go.get_gos_for_pfams(record)
def setUp(self):
    """ Builds a 1000 base record containing one cluster from 100 to 900 """
    self.record = Record(Seq("A" * 1000))
    self.start, self.end = 100, 900
    self.cluster = self.create_cluster(self.start, self.end)
    self.record.add_cluster(self.cluster)

    # adding the cluster to the record must not alter its location
    location = self.cluster.location
    assert location.start == self.start
    assert location.end == self.end
def test_bridge_in_linear_record(self):
    """ Conversion of a linear record must be rejected when it contains a
        bridging CDS or a bridging gene
    """
    def assert_conversion_rejected():
        with self.assertRaisesRegex(ValueError, "Features that bridge"):
            Record.from_biopython(self.seqrec, taxon='bacteria')

    self.seqrec.annotations["topology"] = "linear"

    # first with the CDS
    self.seqrec.features.append(self.seqcds)
    assert_conversion_rejected()

    # then with the gene in place of the first feature
    self.seqrec.features[0] = self.seqgene
    assert_conversion_rejected()
def test_orphaned_cluster_number(self):
    """ Requesting the number of a cluster that was never added to a record
        must raise an error, whether asked of the record or of the cluster
    """
    record = Record(Seq("A" * 1000))
    orphan = helpers.DummyCluster(0, 1000)

    lookups = (
        lambda: record.get_cluster_number(orphan),
        lambda: orphan.get_cluster_number(),
    )
    for lookup in lookups:
        with self.assertRaisesRegex(ValueError, "Cluster not contained in record"):
            print(lookup())
def test_bridge_in_linear_record(self):
    """ Conversion of a linear fungal record must be rejected when it contains
        a bridging CDS or a bridging gene
    """
    def assert_conversion_rejected():
        with self.assertRaisesRegex(SecmetInvalidInputError,
                                    "cannot determine correct exon ordering"):
            Record.from_biopython(self.seqrec, taxon='fungi')

    self.seqrec.annotations["topology"] = "linear"

    # first with the CDS
    self.seqrec.features.append(self.seqcds)
    assert_conversion_rejected()

    # then with the gene in place of the first feature
    self.seqrec.features[0] = self.seqgene
    assert_conversion_rejected()
def add_to_record(self, record: Record) -> None:
    """ Adds a ModuleFeature to the record for every module of every CDS
        result, using the domain features stored in the same CDS result.

        Arguments:
            record: the Record to add the module features to

        Returns:
            None
    """
    for result in self.cds_results.values():
        for module in result.modules:
            domains = []
            for component in module:
                domains.append(result.domain_features[component.domain])

            if module.is_nrps():
                mod_type = ModuleFeature.types.NRPS
            elif module.is_pks():
                mod_type = ModuleFeature.types.PKS
            else:
                mod_type = ModuleFeature.types.UNKNOWN

            record.add_module(ModuleFeature(
                domains,
                mod_type,
                complete=module.is_complete(),
                starter=module.is_starter_module(),
                final=module.is_termination_module(),
                iterative=module.is_iterative(),
            ))
def test_overlapping_clusters(self):
    """ Overlapping clusters must be rejected, while clusters that only share
        a boundary coordinate are accepted
    """
    def build_cluster(start, end):
        return Cluster(FeatureLocation(start, end), 0, 0, [])

    record = Record(seq="A"*40)
    record.add_cluster(build_cluster(10, 40))

    # a single base of overlap is enough to be rejected
    with self.assertRaises(ValueError):
        record.add_cluster(build_cluster(0, 11))

    # ok, since ends aren't inclusive
    record.add_cluster(build_cluster(0, 10))
def test_blank_records(self):
    """ No gene ontologies should be found for a record without Pfam domains
        or for a record with only the placeholder identifier PF00000
    """
    without_pfams = DummyRecord()
    with_dummy_pfam = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))

    placeholder = PFAMDomain(
        location=FeatureLocation(0, 12),
        description='MCPsignal',
        protein_start=0,
        protein_end=5,
        identifier="PF00000",
        tool="test",
    )
    placeholder.domain_id = 'BLANK'
    with_dummy_pfam.add_pfam_domain(placeholder)

    for record in (without_pfams, with_dummy_pfam):
        assert not pfam2go.get_gos_for_pfams(record)
def generate_domains(record: Record) -> NRPSPKSDomains:
    """ Finds and annotates NRPS/PKS domains and motifs for every CDS feature
        within the clusters of a record.

        Only CDS features with at least one domain or motif hit get a result.
        Once all results are gathered, each one annotates its CDS and the
        record.

        Arguments:
            record: the secmet.Record of which to annotate CDS features

        Returns:
            a NRPSPKSDomains instance containing all found motifs and domain HMMs
            for each CDS
    """
    results = NRPSPKSDomains(record.id)

    candidates = record.get_cds_features_within_clusters()
    assert candidates  # because every cluster should have genes

    fasta = get_fasta_from_features(candidates)
    domains_by_cds = find_domains(fasta, record)
    motifs_by_cds = find_ab_motifs(fasta)

    for cds in candidates:
        domain_hits = domains_by_cds.get(cds.get_name(), [])
        motif_hits = motifs_by_cds.get(cds.get_name(), [])
        # a CDS without any hits has nothing to report
        if not domain_hits and not motif_hits:
            continue
        hit_ids = [hit.hit_id for hit in domain_hits]
        results.cds_results[cds] = CDSResult(domain_hits, motif_hits, classify_cds(hit_ids))

    # annotate only after all results exist
    for cds, cds_result in results.cds_results.items():
        cds_result.annotate_domains(record, cds)

    results.added = True
    return results
def from_json(json: Dict[str, Any], record: Record) -> Optional["Pfam2GoResults"]:
    """ Constructs a new Pfam2GoResults instance from a json format and the
        original record analysed.

        Results are discarded if the schema version in the JSON is missing or
        differs from the current schema version.

        Arguments:
            json: JSON representation of Pfam2GoResults
            record: Record analysed

        Returns:
            A Pfam2GoResults instance constructed from the record and the JSON,
            or None if the JSON uses a different (or no) schema version
    """
    # .get() so that JSON lacking a version is discarded rather than causing
    # a KeyError, matching the other from_json implementations
    if json.get("schema_version") != Pfam2GoResults.schema_version:
        logging.warning("Schema version mismatch, discarding Pfam2GO results")
        return None

    all_pfam_ids_to_ontologies = defaultdict(list)  # type: Dict[PFAMDomain, List[GeneOntologies]]
    for domain in record.get_pfam_domains():
        for pfam_id in domain.db_xref:
            # the JSON is keyed by Pfam id without the version suffix
            id_without_version = pfam_id.partition('.')[0]
            if id_without_version not in json["pfams"]:
                continue
            all_ontology = [GeneOntology(go_id, go_description)
                            for go_id, go_description in json["pfams"][id_without_version].items()]
            all_pfam_ids_to_ontologies[domain].append(GeneOntologies(id_without_version, all_ontology))

    return Pfam2GoResults(record.id, all_pfam_ids_to_ontologies)
def get_gos_for_pfams(record: Record) -> Dict[PFAMDomain, List[GeneOntologies]]:
    """ Find Gene Ontology terms for a record's Pfam domains.

        Pfam ids are taken from the db_xref of each domain, with any version
        suffix removed before lookup.

        Arguments:
            record: Record instance to annotate with Gene Ontology information

        Returns:
            A dictionary mapping a specific PFAMDomain instance to a list of
            GeneOntologies within the PFAMDomain.

        Raises:
            ValueError: if an id is not 'PF' followed by five decimal digits
    """
    gos_by_domain = defaultdict(list)  # type: Dict[PFAMDomain, List[GeneOntologies]]
    domains = record.get_pfam_domains()
    mapping_file = path.get_full_path(__file__, 'data', 'pfam2go-march-2018.txt')
    ontologies_by_id = construct_mapping(mapping_file)

    if not domains:
        logging.debug('No Pfam domains found in record, cannot create Pfam to Gene Ontology mapping')

    for domain in domains:
        xrefs = domain.db_xref
        if not xrefs:
            logging.debug('No Pfam ids found in Pfam domain %s, cannot create Pfam to Gene Ontology mapping', domain)
        for xref in xrefs:
            pfam_id = xref.partition('.')[0]  # strip out version number
            if len(pfam_id) != 7 or not pfam_id.startswith('PF') or not pfam_id[2:].isdecimal():
                raise ValueError('Pfam id {} is not a valid Pfam id'.format(pfam_id))
            ontologies = ontologies_by_id.get(pfam_id)
            if ontologies:
                gos_by_domain[domain].append(ontologies)

    return gos_by_domain
def test_record_conversion_from_biopython(self):
    """ A biopython record converted to a secmet Record and back again must
        contain equivalent features, without the original being modified
    """
    before = list(Bio.SeqIO.parse(helpers.get_path_to_nisin_genbank(), "genbank"))[0]

    # sort notes, because direct comparisons otherwise are awful
    for feature in before.features:
        if "note" in feature.qualifiers:
            feature.qualifiers["note"] = sorted(feature.qualifiers["note"])
    before_features = sorted(map(str, before.features))

    type_counts = defaultdict(int)
    for feature in before.features:
        type_counts[feature.type] += 1

    record = Record.from_biopython(before, taxon="bacteria")
    after = record.to_biopython()

    # ensure new features are correct
    assert len(before_features) == len(after.features)
    after_features = sorted(map(str, after.features))
    for original, converted in zip(before_features, after_features):
        assert original == converted

    # ensure we haven't changed the original record or feature list
    assert before is not after
    assert before.features is not after.features
    for original, converted in zip(before.features, after.features):
        assert original is not converted
    for original, current in zip(before_features, sorted(map(str, before.features))):
        assert original == current

    # ensure that the counts of each match
    assert type_counts["CDS"] == len(record.get_cds_features())
    assert type_counts["PFAM_domain"] == len(record.get_pfam_domains())
    assert type_counts["cluster"] == len(record.get_clusters())
    assert type_counts["aSDomain"] == len(record.get_antismash_domains())
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders

        Each border spans from the start of the gene named by the start of the
        prediction to the end of the gene named by the end of the prediction.

        Arguments:
            anchor: the name of the anchor gene the predictions were made for
            clusters: the cluster predictions to convert, the first being
                      noted as the best prediction
            record: the Record containing the genes named in the predictions

        Returns:
            a list of ClusterBorder features, one for each prediction

        Raises:
            ValueError: if a gene named by a prediction is not in the record
    """
    if not clusters:
        return []

    borders = []
    for i, cluster in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        left_name = cluster.start.gene
        right_name = cluster.end.gene
        left = None
        right = None
        for gene in record.get_genes():
            if gene.get_name() == left_name:
                left = gene
            if gene.get_name() == right_name:
                right = gene
            if left and right:
                break

        # fail clearly instead of with an AttributeError on None below
        missing = [str(name) for name, gene in ((left_name, left), (right_name, right))
                   if gene is None]
        if missing:
            raise ValueError("cluster border gene(s) not found in record: {}".format(", ".join(missing)))

        new_feature = SeqFeature(FeatureLocation(left.location.start, right.location.end),
                                 type="cluster_border")
        new_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [cluster.start.abundance + cluster.end.abundance],
            "motif_score": ["{:.1e}".format(cluster.start.score + cluster.end.score)],
            "gene_left": [cluster.start.gene],
            "promoter_left": [cluster.start.promoter],
            "abundance_left": [cluster.start.abundance],
            "motif_left": [cluster.start.pairing_string],
            "motif_score_left": ["{:.1e}".format(cluster.start.score)],
            "gene_right": [cluster.end.gene],
            "promoter_right": [cluster.end.promoter],
            "abundance_right": [cluster.end.abundance],
            "motif_right": [cluster.end.pairing_string],
            "motif_score_right": ["{:.1e}".format(cluster.end.score)],
            "genes": [cluster.genes],
            "promoters": [cluster.promoters],
        }

        # only the first prediction is labelled as the best one
        if i == 0:
            note = "best prediction (most abundant) for anchor gene {}".format(anchor)
        else:
            note = "alternative prediction ({}) for anchor gene {}".format(i, anchor)
        new_feature.qualifiers["note"] = [note]

        borders.append(ClusterBorder.from_biopython(new_feature))
    return borders
def get_gos_for_pfams(record: Record) -> Dict[PFAMDomain, List[GeneOntologies]]:
    """ Find Gene Ontology terms for a record's Pfam domains.

        The identifier of each Pfam domain is looked up directly in the
        Pfam to Gene Ontology mapping.

        Arguments:
            record: Record instance to annotate with Gene Ontology information

        Returns:
            A dictionary mapping a specific PFAMDomain instance to a list of
            GeneOntologies within the PFAMDomain.
    """
    gos_by_domain = defaultdict(list)  # type: Dict[PFAMDomain, List[GeneOntologies]]
    domains = record.get_pfam_domains()
    mapping_file = path.get_full_path(__file__, 'data', 'pfam2go-march-2018.txt')
    ontologies_by_id = construct_mapping(mapping_file)

    if not domains:
        logging.debug('No Pfam domains found in record, cannot create Pfam to Gene Ontology mapping')

    for domain in domains:
        ontologies = ontologies_by_id.get(domain.identifier)
        # domains without any ontologies are left out of the result
        if ontologies:
            gos_by_domain[domain].append(ontologies)

    return gos_by_domain
def test_blank_records(self):
    """ Records without Pfam domains, or with a Pfam domain lacking ids, must
        give no gene ontologies and log a debug message explaining why
    """
    without_pfams = DummyRecord()
    without_ids = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))

    idless_pfam = PFAMDomain(
        location=FeatureLocation(0, 12),
        description='MCPsignal',
        protein_start=0,
        protein_end=5,
    )
    idless_pfam.domain_id = 'BLANK'
    without_ids.add_pfam_domain(idless_pfam)

    # both calls happen while logs are being captured
    with self.assertLogs(level='DEBUG') as log_cm:
        found = pfam2go.get_gos_for_pfams(without_pfams)
        assert 'No Pfam domains found' in str(log_cm.output)
        assert not found

        found = pfam2go.get_gos_for_pfams(without_ids)
        assert 'No Pfam ids found' in str(log_cm.output)
        assert not found
def test_cds_cluster_linkage(self):
    """ Each cluster added must be linked to exactly the two CDS features
        expected for its location
    """
    record = Record("A"*200)
    cds_locations = [(50, 100), (10, 90), (0, 9), (150, 200)]
    for start, end in cds_locations:
        record.add_cds_feature(helpers.DummyCDS(start, end))

    cluster_locations = [(10, 120), (5, 110), (10, 160), (45, 200)]
    for start, end in cluster_locations:
        # only one cluster at a time, so they cannot conflict with each other
        record.clear_clusters()
        cluster = helpers.DummyCluster(start, end)
        record.add_cluster(cluster)

        children = cluster.cds_children
        assert len(children) == 2
        for cds in cluster.cds_children:
            assert cds.overlaps_with(cluster)
def test_cds_with_no_id(self):
    """ A bridging CDS without identifiers must be split into two CDS features
        with generated names for the lower and upper sections
    """
    self.seqrec.features.append(self.seqcds)
    rec = Record.from_biopython(self.seqrec, taxon="bacteria")

    cdses = rec.get_cds_features()
    assert len(cdses) == 2

    expected = [
        (0, 9, "bridge_LOWER"),
        (12, 21, "bridge_UPPER"),
    ]
    for cds, (start, end, name) in zip(cdses, expected):
        assert cds.location.start == start
        assert cds.location.end == end
        assert cds.get_name() == name
def store_promoters(promoters: Iterable[Promoter], record: Record) -> None:
    """ Stores each promoter in the record as a generic "promoter" feature.

        Arguments:
            promoters: the promoters to convert into features
            record: the Record to add the promoter features to

        Returns:
            None
    """
    logging.critical("adding promoters based on biopython features")
    for promoter in promoters:
        # remember to account for 0-indexed start location
        start = max(0, promoter.start - 1)
        bio_feature = SeqFeature(FeatureLocation(start, promoter.end), type="promoter")

        qualifiers = {
            "locus_tag": promoter.get_gene_names(),  # already a list with one or two elements
            "seq": [str(promoter.seq)],  # TODO save string or Seq object?
        }
        if isinstance(promoter, CombinedPromoter):
            qualifiers["note"] = ["bidirectional promoter"]
        bio_feature.qualifiers = qualifiers

        feature = Feature.from_biopython(bio_feature)
        feature.created_by_antismash = True
        record.add_feature(feature)
def setUp(self):
    """ Builds a small record with two CDS features on opposite strands,
        both within a single cluster covering the whole sequence
    """
    # CDS features cover the first and last nine bases of the sequence
    record = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
    for start, end, strand in [(0, 9, 1), (12, 21, -1)]:
        record.add_cds_feature(DummyCDS(start, end, strand=strand))

    cluster = Cluster(FeatureLocation(0, 21), 0, 0, [])
    record.add_cluster(cluster)

    # if these aren't correct, the tests will fail
    assert len(cluster.cds_children) == 2
    for cds in record.get_cds_features():
        location = str(cds.location)
        assert cds.overlaps_with(cluster)
        assert cds.cluster == cluster, location
        assert cds.extract(record.seq) == "ATGTTATGA", location

    self.record = record
def from_json(json: Dict[str, Any], record: Record) -> Optional["NRPSPKSDomains"]:
    """ Reconstructs NRPSPKSDomains from their JSON representation, annotating
        the record and its CDS features with the stored results.

        Arguments:
            json: the JSON representation of a NRPSPKSDomains instance
            record: the Record the results were generated from

        Returns:
            a NRPSPKSDomains instance, or None if the schema version or the
            record identifier in the JSON do not match
    """
    if json.get("schema_version") != NRPSPKSDomains.schema_version:
        logging.warning("Schema version mismatch, discarding NRPS PKS domain results")
        return None
    if json.get("record_id") != record.id:
        logging.warning("Record identifier mismatch, discarding NRPS PKS domain results")
        return None

    results_by_cds = {}
    for name, raw_result in json["cds_results"].items():
        cds = record.get_cds_by_name(name)
        rebuilt = CDSResult.from_json(raw_result)
        # reapply the annotations to the record and CDS
        rebuilt.annotate_domains(record, cds)
        results_by_cds[cds] = rebuilt

    return NRPSPKSDomains(record.id, results_by_cds)
def generate_domains(record: Record) -> NRPSPKSDomains:
    """ Finds and annotates NRPS/PKS domains, KS subtypes, motifs and modules
        for every CDS feature within the regions of a record.

        Only CDS features with at least one domain or motif hit get a result.
        Modules of consecutive CDS features with results are combined where
        possible. Once all results are gathered, each one annotates its CDS
        and the record.

        Arguments:
            record: the secmet.Record of which to annotate CDS features

        Returns:
            a NRPSPKSDomains instance containing all found motifs and domain HMMs
            for each CDS
    """
    results = NRPSPKSDomains(record.id)

    candidates = record.get_cds_features_within_regions()
    assert candidates  # because every cluster should have genes

    fasta = get_fasta_from_features(candidates)
    domains_by_cds = find_domains(fasta, record)
    ks_subtypes_by_cds = find_ks_domains(fasta)
    motifs_by_cds = find_ab_motifs(fasta)

    previous: Optional[CDSModuleInfo] = None
    for cds in candidates:
        domain_hits = domains_by_cds.get(cds.get_name(), [])
        motif_hits = motifs_by_cds.get(cds.get_name(), [])
        # a CDS without any hits has nothing to report
        if not domain_hits and not motif_hits:
            continue

        ks_hits = ks_subtypes_by_cds.get(cds.get_name(), [])
        subtypes = match_subtypes_to_ks_domains(domain_hits, ks_hits)
        hit_ids = [hit.hit_id for hit in domain_hits]
        cds_type = classify_cds(hit_ids, subtypes)
        modules = build_modules_for_cds(domain_hits, subtypes, cds.get_name())
        results.cds_results[cds] = CDSResult(domain_hits, motif_hits, cds_type,
                                             modules, subtypes)

        # combine modules that cross CDS boundaries, if possible and relevant
        current = CDSModuleInfo(cds, modules)
        if previous and previous.modules and current.modules:
            # modifies the lists of modules linked in each CDSResult
            combine_modules(current, previous)
        previous = current

    # annotate only after all results exist
    for cds, cds_result in results.cds_results.items():
        cds_result.annotate_domains(record, cds)
    return results
def test_cds_split(self):
    """ A bridging CDS must be split into two CDS features, each named after
        the original identifier with a suffix for the lower or upper section
    """
    self.seqrec.features.append(self.seqcds)
    print(self.seqcds)

    expected = [
        (0, 9, "test_LOWER"),
        (12, 21, "test_UPPER"),
    ]
    for id_name in ["locus_tag", "gene"]:
        self.seqcds.qualifiers[id_name] = ["test"]
        rec = Record.from_biopython(self.seqrec, taxon="bacteria")

        cdses = rec.get_cds_features()
        assert len(cdses) == 2
        for cds, (start, end, name) in zip(cdses, expected):
            assert cds.location.start == start
            assert cds.location.end == end
            assert getattr(cds, id_name) == name
            assert cds.get_name() == name

        # only one kind of identifier is present at a time
        self.seqcds.qualifiers.pop(id_name)
def get_anchor_gene_names(record: Record) -> List[str]:
    """ Collects the names of all CDS features that have a core gene function.

        Requires that a CDS.get_name() returns the same name of its parent
        Gene.get_name()

        Arguments:
            record: the record to search

        Returns:
            a list of gene names, in the order the CDS features are provided
            by the record
    """
    return [cds.get_name()
            for cds in record.get_cds_features()
            if cds.gene_function == GeneFunction.CORE]
def filter_nonterminal_docking_domains(record: Record, cds_domains: Dict[str, List[HMMResult]],
                                       terminal_window: int = 50,
                                       ) -> Dict[str, List[HMMResult]]:
    """ For multiprotein domains, remove all docking terminal predictions that
        aren't overlapping with the first or last section of the protein.

        Arguments:
            record: the Record containing the CDS features named in cds_domains
            cds_domains: a dictionary mapping CDS name to a list of HMM hits
                         for that CDS
            terminal_window: the number of amino acids at each end of the
                             protein in which docking domains are kept

        Returns:
            a new dictionary mapping CDS name to the hits that were kept,
            without entries for CDS features that had no hits kept
    """
    dockingdomains = {'NRPS-COM_Nterm', 'NRPS-COM_Cterm',
                      'PKS_Docking_Cterm', 'PKS_Docking_Nterm'}
    feature_by_id = record.get_cds_name_mapping()
    results: Dict[str, List[HMMResult]] = {}
    for cds_name, hits in cds_domains.items():
        cds_length = len(feature_by_id[cds_name].translation)
        kept = []
        for hit in hits:
            if hit.hit_id in dockingdomains:
                # the coordinates are not assumed to be in ascending order
                first = min(hit.query_start, hit.query_end)
                last = max(hit.query_start, hit.query_end)
                near_start = first < terminal_window
                near_end = cds_length - last < terminal_window
                if not (near_start or near_end):
                    continue
            kept.append(hit)
        if kept:
            results[cds_name] = kept
    return results
def test_gene_split(self):
    """ A bridging gene must be split into two genes, each named after the
        original identifier with a suffix for the upper or lower section
    """
    self.seqrec.features.append(self.seqgene)
    for id_name in ["locus_tag", "gene"]:
        expected = id_name + "_test"
        self.seqgene.qualifiers[id_name] = [expected]
        rec = Record.from_biopython(self.seqrec, taxon="bacteria")
        # only one kind of identifier is present at a time
        self.seqgene.qualifiers.pop(id_name)

        genes = rec.get_genes()
        assert len(genes) == 2

        if id_name == "gene":
            id_name = "gene_name"  # since a Gene doesn't have a gene member

        sections = [
            (12, 21, expected + "_UPPER"),
            (0, 9, expected + "_LOWER"),
        ]
        for gene, (start, end, name) in zip(genes, sections):
            assert gene.location.start == start
            assert gene.location.end == end
            assert getattr(gene, id_name) == name
            assert gene.get_name() == name
def setUp(self):
    """ Builds a small record with two CDS features on opposite strands,
        both within a single protocluster, then creates the candidate
        clusters and regions for it
    """
    # CDS features cover the first and last nine bases of the sequence
    record = Record(Seq("ATGTTATGAGGGTCATAACAT"))
    for start, end, strand in [(0, 9, 1), (12, 21, -1)]:
        record.add_cds_feature(DummyCDS(start, end, strand=strand))

    protocluster = DummyProtocluster(start=0, end=21)
    record.add_protocluster(protocluster)
    record.create_candidate_clusters()
    record.create_regions()

    # if these aren't correct, the tests will fail
    assert len(protocluster.cds_children) == 2
    assert len(record.get_regions()) == 1
    for cds in record.get_cds_features():
        assert cds.is_contained_by(protocluster)
        assert cds.extract(record.seq) == "ATGTTATGA", str(cds.location)

    self.record = record