def test_hybrid_interactions(self):
    """ A protocluster overlapping a chemical hybrid gives an interleaved
        candidate over everything, followed by the hybrid candidate itself
    """
    outer = create_cluster(3, 8, 171, 176, "a")
    inner = create_cluster(3, 8, 50, 55, "b")
    nested = create_cluster(80, 90, 100, 110, "c")  # will form part of hybrid

    # a single CDS carrying both the "a" and "b" products, shared by both protoclusters
    shared_cds = create_cds(8, 50, ["a", "b"])
    for proto in (outer, inner):
        proto.add_cds(shared_cds)

    # both variants are built up front, each is then tested on its own
    variants = (
        create_cluster(120, 130, 200, 250, "d"),
        create_cluster(60, 70, 200, 250, "d"),
    )
    for extra in variants:
        created = creator([outer, inner, nested, extra])
        assert len(created) == 2

        interleaved = created[0]
        assert interleaved.location == FeatureLocation(3, 250)
        assert interleaved.kind == CandidateCluster.kinds.INTERLEAVED
        assert interleaved.protoclusters == tuple(sorted([outer, inner, nested, extra]))

        chem_hybrid = created[1]
        assert chem_hybrid.location == FeatureLocation(3, 176)
        assert chem_hybrid.kind == CandidateCluster.kinds.CHEMICAL_HYBRID
        assert chem_hybrid.protoclusters == (outer, inner, nested)
def test_prepeptide_adjustment(self):
    """ Leader/tail location qualifiers of a prepeptide motif must be rewritten
        relative to the region when the region is written out as genbank
    """
    record = Record(Seq("A"*400, generic_dna))
    sub = DummySubRegion(start=100, end=300)
    record.add_subregion(sub)
    region = Region(subregions=[sub])
    record.add_region(region)

    prepeptide = DummyFeature(200, 230, 1, "CDS_motif")
    # ensure both FeatureLocation and CompoundLocations are handled appropriately
    leader_location = FeatureLocation(200, 210, 1)
    tail_location = CompoundLocation([
        FeatureLocation(220, 223, -1),
        FeatureLocation(227, 230, -1),
    ])
    prepeptide._qualifiers["leader_location"] = [str(leader_location)]
    prepeptide._qualifiers["tail_location"] = [str(tail_location)]
    record.add_feature(prepeptide)
    # and add a CDS_motif without either qualifier (e.g. NRPS/PKS motif) to ensure that doesn't break
    record.add_feature(DummyFeature(250, 280, 1, "CDS_motif"))

    with NamedTemporaryFile(suffix=".gbk") as handle:
        region.write_to_genbank(handle.name)
        parsed = list(seqio.parse(handle.name))[0]

    assert len(parsed.features) == 4
    seen_prepeptide = False
    for converted in parsed.features:
        qualifiers = converted.qualifiers
        tail = qualifiers.get("tail_location")
        leader = qualifiers.get("leader_location")
        if not (tail and leader):
            continue
        # the part locations should now be adjusted backwards 100 bases
        assert leader == ["[100:110](+)"]
        assert tail == ["join{[120:123](-), [127:130](-)}"]
        seen_prepeptide = True
    assert seen_prepeptide, "prepeptide feature missing in conversion"
def test_conversion(self):
    """ A sideloaded protocluster must round-trip through biopython features,
        whichever of the two classes is asked to regenerate it
    """
    extras = {"a": ["5", "c"], "b": ["something"]}
    original = SideloadedProtocluster(FeatureLocation(8, 71, strand=1),
                                      FeatureLocation(3, 76, strand=1),
                                      "tool name", "some-product",
                                      extra_qualifiers=extras)
    # core is 8..71 and surrounds 3..76, so the neighbourhood is 5 either side
    assert original.neighbourhood_range == 5

    converted = original.to_biopython()
    assert len(converted) == 2
    for name, values in extras.items():
        assert converted[0].qualifiers[name] == values

    compared_attributes = ("tool", "product", "location", "core_location", "neighbourhood_range")
    for regenerator in (SideloadedProtocluster, Protocluster):
        rebuilt = regenerator.from_biopython(converted[0])
        assert isinstance(rebuilt, SideloadedProtocluster)
        assert rebuilt.extra_qualifiers == original.extra_qualifiers == extras
        for attribute in compared_attributes:
            assert getattr(rebuilt, attribute) == getattr(original, attribute)
        # the extras must not also be reachable as ordinary qualifiers
        for name in extras:
            assert not rebuilt.get_qualifier(name)
def create_cluster(start, end, product='a'):
    """ Builds a test Cluster whose core and surrounding locations both cover
        the given start and end
    """
    core = FeatureLocation(start, end)
    surrounds = FeatureLocation(start, end)
    return Cluster(
        core,
        surrounds,
        tool="testing",
        product=product,
        cutoff=1,
        neighbourhood_range=0,
        detection_rule="some rule text",
    )
def create_cluster():
    """ Builds a fixed test Cluster with a core of 8..71 and surrounds of 3..76,
        both on the forward strand
    """
    core = FeatureLocation(8, 71, strand=1)
    surrounds = FeatureLocation(3, 76, strand=1)
    return Cluster(core, surrounds, tool="test", cutoff=17, neighbourhood_range=5,
                   product='a', detection_rule="some rule text")
def create_cluster(n_start, start, end, n_end, product='a'):
    """ Builds a test Cluster with a core of start..end and surrounds of
        n_start..n_end, containing one CDS that covers the core and carries
        the given product
    """
    core = FeatureLocation(start, end)
    surrounds = FeatureLocation(n_start, n_end)
    built = Cluster(core, surrounds, tool="testing", product=product, cutoff=1,
                    neighbourhood_range=0, detection_rule="some rule text")
    built.add_cds(create_cds(start, end, [product]))
    return built
def test_edge_overlap_before(self):
    """ Checks distance_to_pfam for a CDS ending exactly at 10000 (expects -1)
        and then one base further, on either strand (expects 39999)
    """
    profiles = ["l.edge"]

    def distance():
        return utils.distance_to_pfam(self.record, self.query, profiles)

    cds = self.create_cds(9000, 10000, profiles=["l.edge"])
    self.record.add_cds_feature(cds)
    assert distance() == -1

    # extending the CDS by a single base changes the result, whatever the strand
    for strand in (1, -1):
        cds.location = FeatureLocation(9000, 10001, strand=strand)
        assert distance() == 39999
def test_construction(self):
    """ A Domain built with a protein location and locus tag should expose
        the values given at construction
    """
    location = FeatureLocation(1, 15, 1)
    within_protein = FeatureLocation(0, 3)
    built = Domain(location, "test_type", tool="test",
                   protein_location=within_protein, locus_tag="locus")

    # values passed straight through
    assert built.type == "test_type"
    assert built.location == location
    assert built.created_by_antismash
    assert built.tool == "test"
    # nothing was supplied for the domain name
    assert built.domain is None
    assert built.protein_location == within_protein
def test_interleaving(self):
    """ Two overlapping hybrids plus an overlapping single protocluster form
        one interleaved candidate, while a distant hybrid stays separate
    """
    def share(cds, protos):
        # the same CDS instance goes into every protocluster of the group
        for proto in protos:
            proto.add_cds(cds)

    # these first two hybrid clumps should be interleaved
    first_pair = [
        create_cluster(30, 60, 120, 150, "a"),
        create_cluster(60, 90, 150, 180, "b"),
    ]
    share(create_cds(90, 120, ["a", "b"]), first_pair)

    second_pair = [
        create_cluster(90, 120, 250, 280, "c"),
        create_cluster(190, 220, 280, 310, "d"),
    ]
    share(create_cds(220, 250, ["c", "d"]), second_pair)

    # this non-hybrid should also be included in the interleaved
    lone = create_cluster(230, 250, 410, 430, "e")

    # this hybrid should not
    distant_pair = [
        create_cluster(1000, 1100, 1400, 1500, "f"),
        create_cluster(1100, 1200, 1500, 1600, "g"),
    ]
    share(create_cds(1300, 1400, ["f", "g"]), distant_pair)

    created = creator(first_pair + second_pair + [lone] + distant_pair)
    assert len(created) == 4
    hybrid_kind = CandidateCluster.kinds.CHEMICAL_HYBRID

    interleaved = created[0]
    assert interleaved.location == FeatureLocation(30, 430)
    assert interleaved.core_location == FeatureLocation(60, 410)
    assert interleaved.kind == CandidateCluster.kinds.INTERLEAVED
    assert interleaved.protoclusters == tuple(first_pair + second_pair + [lone])

    assert created[1].location == FeatureLocation(30, 180)
    assert created[1].protoclusters == tuple(first_pair)
    assert created[2].location == FeatureLocation(90, 310)
    assert created[2].protoclusters == tuple(second_pair)
    for candidate in created[1:3]:
        assert candidate.kind == hybrid_kind

    assert created[3].location == FeatureLocation(1000, 1600)
    assert created[3].kind == hybrid_kind
def test_interleaving_order(self):
    """ Checks the kinds, locations and order of the candidates created when a
        long protocluster sorts between two interleaving ones
    """
    protos = [
        create_cluster(1000, 1100, 1400, 1500, "a"),
        create_cluster(1050, 2000, 3000, 4000, "b"),  # sorts second due to neighbouring
        create_cluster(1100, 1200, 1500, 1600, "c"),
    ]
    # the input must already be in sorted order for the test to be meaningful
    assert sorted(protos) == protos

    created = creator(protos)
    assert len(created) == 3

    kinds = CandidateCluster.kinds
    expectations = (
        (kinds.NEIGHBOURING, FeatureLocation(1000, 4000)),
        (kinds.INTERLEAVED, FeatureLocation(1000, 1600)),
        (kinds.SINGLE, FeatureLocation(1050, 4000)),
    )
    for candidate, (kind, location) in zip(created, expectations):
        assert candidate.kind == kind
        assert candidate.location == location
def create_subregions(anchor: str, cluster_preds: List[ClusterPrediction],
                      record: Record) -> List[SubRegion]:
    """ Builds one SubRegion for each cluster prediction made for an anchor gene

        Arguments:
            anchor: the name of the anchor gene, recorded in each subregion
            cluster_preds: the predictions to convert, the first of which is
                           annotated as the best prediction
            record: the Record in which to find each prediction's boundary genes

        Returns:
            a list of SubRegions, one per prediction and in the same order
    """
    if not cluster_preds:
        return []

    scientific = "{:.1e}".format

    subregions = []  # type: List[SubRegion]
    for index, prediction in enumerate(cluster_preds):
        first = prediction.start
        last = prediction.end

        # clusters returned by hmmdetect are based on CDS features
        # in contrast, subregions returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived subregions may have fuzzy locations, like the genes have
        left_gene = None
        right_gene = None
        for gene in record.get_genes():
            gene_name = gene.get_name()
            if gene_name == first.gene:
                left_gene = gene
            if gene_name == last.gene:
                right_gene = gene
            # no need to look further once both boundaries are known
            if left_gene and right_gene:
                break
        assert left_gene and right_gene, "boundary genes no longer present in Record"

        bio_feature = SeqFeature(FeatureLocation(left_gene.location.start, right_gene.location.end),
                                 type="subregion")
        qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [first.abundance + last.abundance],
            "motif_score": [scientific(first.score + last.score)],
            "gene_left": [first.gene],
            "promoter_left": [first.promoter],
            "abundance_left": [first.abundance],
            "motif_left": [first.pairing_string],
            "motif_score_left": [scientific(first.score)],
            "gene_right": [last.gene],
            "promoter_right": [last.promoter],
            "abundance_right": [last.abundance],
            "motif_right": [last.pairing_string],
            "motif_score_right": [scientific(last.score)],
            "genes": [prediction.genes],
            "promoters": [prediction.promoters],
        }
        # only the first prediction is described as the best one
        if index == 0:
            note = "best prediction (most abundant) for anchor gene {}".format(anchor)
        else:
            note = "alternative prediction ({}) for anchor gene {}".format(index, anchor)
        qualifiers["note"] = [note]
        bio_feature.qualifiers = qualifiers

        subregions.append(SubRegion.from_biopython(bio_feature))

    return subregions
def test_add_to_record(self):
    """ Gene ontologies found by pfam2go should only reach the PFAM domain
        once the results are explicitly added to the record
    """
    record = record_processing.parse_input_sequence(helpers.get_path_to_nisin_genbank())[0]
    assert not record.get_pfam_domains()

    # add a test PFAM
    test_domain = PFAMDomain(FeatureLocation(2, 5), description="test",
                             protein_start=5, protein_end=10,
                             identifier="PF00005", domain="PF00005", tool="test")
    test_domain.domain_id = "test"
    record.add_pfam_domain(test_domain)
    assert len(record.get_pfam_domains()) == 1

    # run pfam2go and add the results
    results = pfam2go.run_on_record(record, None, self.options)
    assert test_domain in results.pfam_domains_with_gos
    # running alone must not have touched the domain yet
    assert not test_domain.gene_ontologies

    results.add_to_record(record)
    assert test_domain.gene_ontologies

    # check the contents of the annotation
    for annotated in record.get_pfam_domains():
        assert annotated.gene_ontologies
        found_ids = sorted(annotated.gene_ontologies.ids)
        expected_ids = sorted(results.get_all_gos(annotated))
        assert found_ids == expected_ids
def test_angstrom(self):
    """ Checks the 34 amino acid signature extracted for the "query" alignment """
    query = AntismashDomain(FeatureLocation(1, 2), "test")
    query.domain_id = "query"
    # the translation is the stored alignment with its gaps stripped
    aligned = self.aligns[query.domain_id]
    query.translation = aligned.replace("-", "")

    signature = nrps_predictor.get_34_aa_signature(query)
    assert signature == "L--SFDASLFEMYLLTGGDRNMYGPTEATMCATW"
def test_genbank(self):
    """ A region written to genbank and parsed back should keep its products
        and probabilities, with its location shifted by the region start
    """
    record = Record(Seq("A" * 100, generic_dna))

    clusters = [create_cluster(3, 20, "prodA"), create_cluster(25, 41, "prodB")]
    for single in clusters:
        record.add_cluster(single)

    subregion = SubRegion(FeatureLocation(35, 71), "test", 0.7)
    record.add_subregion(subregion)

    supercluster = SuperCluster(SuperCluster.kinds.NEIGHBOURING, clusters)
    record.add_supercluster(supercluster)

    region = Region(superclusters=[supercluster], subregions=[subregion])
    record.add_region(region)

    with NamedTemporaryFile(suffix=".gbk") as handle:
        region.write_to_genbank(handle.name)
        parsed = list(seqio.parse(handle.name))

    assert len(parsed) == 1
    rebuilt_record = Record.from_biopython(parsed[0], taxon="bacteria")
    assert len(rebuilt_record.get_regions()) == 1

    rebuilt = rebuilt_record.get_region(0)
    offset = region.location.start
    # coordinates in the written file are relative to the region itself
    assert rebuilt.location.start == 3 - offset
    assert rebuilt.location.end == 71 - offset
    assert rebuilt.products == region.products
    assert rebuilt.probabilities == region.probabilities
def apply_cluster_rules(record: Record, results_by_id: Dict[str, List[HSP]],
                        rules: List[rule_parser.DetectionRule]
                        ) -> Tuple[Dict[str, Dict[str, Set[str]]], Dict[str, Set[str]]]:
    """ Run detection rules over each CDS with hits and record which rules
        each CDS satisfies.

        A CDS can satisfy multiple rules, in which case every satisfied rule
        is recorded separately for that CDS.

        NOTE(review): earlier documentation described joining satisfied rule
        names with '-' and giving an 'other' rule lower precedence; neither
        is done within this function, confirm whether callers handle it.

        Args:
            record: the record being checked
            results_by_id: A dict of CDS ID to a list of HSP results
            rules: A list of DetectionRule instances

        Returns:
            A tuple of
                a dictionary mapping CDS ID to
                    a dictionary mapping rule name to
                        the matches reported by that rule for the CDS
                and a dictionary mapping rule name to
                    a set of CDS feature names that matched the rule
    """
    if not results_by_id:
        return {}, {}

    # process CDS features in order of their position in the record
    cds_with_hits = sorted(results_by_id,
                           key=lambda gene_id: record.get_cds_by_name(gene_id).location.start)

    cds_domains_by_cluster_type = {}  # type: Dict[str, Dict[str, Set[str]]]
    cluster_type_hits = defaultdict(set)  # type: Dict[str, Set[str]]
    for cds_name in cds_with_hits:
        feature = record.get_cds_by_name(cds_name)
        feature_start, feature_end = sorted([feature.location.start, feature.location.end])
        # rules sharing a cutoff see the same neighbourhood, so cache it per cutoff
        info_by_range = {}  # type: Dict[int, Tuple[Dict[str, CDSFeature], Dict[str, List[HSP]]]]
        domains_by_cluster = {}  # type: Dict[str, Set[str]]
        for rule in rules:
            if rule.cutoff not in info_by_range:
                location = FeatureLocation(feature_start - rule.cutoff, feature_end + rule.cutoff)
                nearby = record.get_cds_features_within_location(location, with_overlapping=True)
                nearby_features = {neighbour.get_name(): neighbour for neighbour in nearby}
                # only neighbours that have hits of their own are relevant to a rule
                nearby_results = {neighbour: results_by_id[neighbour]
                                  for neighbour in nearby_features if neighbour in results_by_id}
                info_by_range[rule.cutoff] = (nearby_features, nearby_results)
            nearby_features, nearby_results = info_by_range[rule.cutoff]
            matching = rule.detect(cds_name, nearby_features, nearby_results)
            # a rule only counts when it is both satisfied and reports matches
            if matching.met and matching.matches:
                domains_by_cluster[rule.name] = matching.matches
                cluster_type_hits[rule.name].add(cds_name)
        # CDS features satisfying no rule are left out of the result entirely
        if domains_by_cluster:
            cds_domains_by_cluster_type[cds_name] = domains_by_cluster
    return cds_domains_by_cluster_type, cluster_type_hits
def setUp(self):
    """ Prepares a cluster with a core of 30..50 and three dummy CDS features
        at 40..45, 20..25 and 120..125
    """
    cluster = create_cluster()
    cluster.core_location = FeatureLocation(30, 50)
    self.cluster = cluster

    self.inside_cds = DummyCDS(40, 45)
    self.neighbour_cds = DummyCDS(20, 25)
    self.outside_cds = DummyCDS(120, 125)

    # every test relies on starting from an empty cluster
    assert not cluster.cds_children
    assert not cluster.definition_cdses
def test_construction(self):
    """ A Domain built from only a location, type and tool should expose
        those values and have no domain name
    """
    location = FeatureLocation(1, 15, 1)
    built = Domain(location, "test_type", tool="test")

    assert built.type == "test_type"
    assert built.location == location
    assert built.created_by_antismash
    assert built.tool == "test"
    # nothing was supplied for the domain name
    assert built.domain is None
def test_probabilities(self):
    """ A region's probabilities should list only the subregion probabilities
        that were actually set, in order
    """
    location = FeatureLocation(0, 10)
    candidates = [DummyCandidateCluster([create_protocluster(0, 10)])]
    # with no subregions at all, there is nothing to report
    assert Region(candidate_clusters=candidates).probabilities == []

    # each step adds one more subregion to the same growing list
    steps = (
        (None, []),
        (0.1, [0.1]),
        (0.7, [0.1, 0.7]),
    )
    subs = []
    for probability, expected in steps:
        subs.append(SubRegion(location, "testtool", probability=probability))
        region = Region(candidate_clusters=candidates, subregions=subs)
        assert region.probabilities == expected
def test_core(self):
    """ The core of a candidate cluster should span from the first
        protocluster's core start to the last protocluster's core end
    """
    members = [
        create_cluster(5, 10, 20, 25, "a"),
        create_cluster(30, 40, 50, 60, "b"),
    ]
    candidate = CandidateCluster(CandidateClusterKind.NEIGHBOURING, members,
                                 smiles="dummy", polymer="dummy")
    expected_core = FeatureLocation(10, 50)
    assert candidate.core_location == expected_core
def __init__(self, name=None, function="other", components=None, location=None, start=None, strand=1):
    """ Fills in defaults for any argument not supplied, then defers to the
        parent constructor

        Arguments:
            name: the name to use, if None a numbered name is generated from
                  the class-wide counter
            function: passed through unchanged
            components: the components to use, if None then empty "secmet"
                        and "modules" lists are used
            location: the location to use, if None one of length 20 is built
                      from start and strand
            start: the start of the generated location, 20 if not given,
                   ignored when a location is supplied
            strand: the strand of the generated location, ignored when a
                    location is supplied
    """
    if name is None:
        # each unnamed instance takes the next number from the shared counter
        DummyReferenceCDS.counter += 1
        name = f"test_ref_{DummyReferenceCDS.counter}"

    if components is None:
        components = {"secmet": [], "modules": []}

    if location is None:
        begin = 20 if start is None else start
        location = FeatureLocation(begin, begin + 20, strand)

    super().__init__(name, function, components, location)
def test_translation(self):
    """ Translations cannot be read before being set and must be non-empty
        strings without stop codons
    """
    built = Domain(FeatureLocation(1, 15, 1), "test_type", tool="test",
                   protein_location=FeatureLocation(0, 3), locus_tag="locus")

    # reading before any value has been set is an error
    with self.assertRaisesRegex(ValueError, "has no translation"):
        assert built.translation is None

    built.translation = "AAA"
    assert built.translation == "AAA"

    with self.assertRaisesRegex(ValueError, "stop codons"):
        built.translation = "A*A"

    # anything that isn't a string should be rejected outright
    for not_a_string in (7, None, Domain):
        with self.assertRaises(AssertionError):
            built.translation = not_a_string

    with self.assertRaisesRegex(ValueError, "empty"):
        built.translation = ""
def test_sideloaded(self):
    """ Only the sideloaded protocluster and the sideloaded subregion should
        be reported as a region's sideloaded areas
    """
    normal_proto = create_protocluster(3, 20, "prodA")
    side_proto = SideloadedProtocluster(FeatureLocation(25, 41), FeatureLocation(25, 41),
                                        "external", "prodB")
    candidate = CandidateCluster(CandidateCluster.kinds.NEIGHBOURING, [normal_proto, side_proto])

    normal_sub = SubRegion(FeatureLocation(35, 71), "test", 0.7)
    side_sub = SideloadedSubRegion(FeatureLocation(45, 61), "external")

    region = Region(candidate_clusters=[candidate], subregions=[normal_sub, side_sub])

    found = region.get_sideloaded_areas()
    assert len(found) == 2
    # identity matters here, the very same objects must be returned
    assert found[0] is side_proto
    assert found[1] is side_sub
def test_creation_mixed(self):
    """ A mix of hybrid, overlapping, neighbouring and isolated clusters
        should produce one supercluster of each expected kind, in order
    """
    first = create_cluster(3, 8, 71, 76, 'a')
    hybrid_partner = create_cluster(50, 60, 120, 170, 'b')
    overlapping = create_cluster(80, 90, 130, 180, 'o')
    neighbour = create_cluster(50, 210, 260, 270, 'a')
    isolated = create_cluster(450, 500, 550, 600, 'alone')

    # insert the cds that will cause the hybrid call
    shared_cds = create_cds(60, 65, ["a", "b"])
    for member in (first, hybrid_partner):
        member.add_cds(shared_cds)

    created = creator([first, hybrid_partner, overlapping, neighbour, isolated])
    print(created)
    assert len(created) == 5

    kinds = SuperCluster.kinds
    expectations = (
        (FeatureLocation(3, 270), kinds.NEIGHBOURING, (first, hybrid_partner, overlapping, neighbour)),
        (FeatureLocation(3, 180), kinds.INTERLEAVED, (first, hybrid_partner, overlapping)),
        (FeatureLocation(3, 170), kinds.CHEMICAL_HYBRID, (first, hybrid_partner)),
        (FeatureLocation(50, 270), kinds.SINGLE, (neighbour,)),
        (FeatureLocation(450, 600), kinds.SINGLE, (isolated,)),
    )
    for supercluster, (location, kind, members) in zip(created, expectations):
        assert supercluster.location == location
        assert supercluster.kind == kind
        assert supercluster.clusters == members
def test_creation_neighbours(self):
    """ Two clusters should give a neighbouring supercluster spanning both,
        followed by a single supercluster for each
    """
    first = create_cluster(3, 8, 71, 76, 'a')
    second = create_cluster(50, 100, 120, 170, 'b')

    created = creator([first, second])
    print(created)
    assert len(created) == 3

    kinds = SuperCluster.kinds
    spanning = FeatureLocation(first.location.start, second.location.end)

    assert created[0].kind == kinds.NEIGHBOURING
    assert created[0].location == spanning

    assert created[1].kind == kinds.SINGLE
    assert created[1].location == first.location

    assert created[2].kind == kinds.SINGLE
    assert created[2].location == second.location
def test_creation_coreoverlap(self):
    """ Clusters sharing a CDS that carries only one of their products should
        be interleaved, not treated as a chemical hybrid
    """
    first = create_cluster(3, 8, 71, 76, 'a')
    second = create_cluster(50, 60, 120, 170, 'b')

    # create a CDS within both clusters that has a product from only one cluster
    shared_cds = create_cds(60, 65, ["a"])
    for member in (first, second):
        member.add_cds(shared_cds)

    created = creator([first, second])
    print(created)
    assert len(created) == 1

    only = created[0]
    assert only.kind == SuperCluster.kinds.INTERLEAVED
    assert only.location == FeatureLocation(3, 170)
def test_product(self):
    """ Malformed product names must be rejected when building a Protocluster """
    location = FeatureLocation(1, 6, strand=1)
    # everything other than the product is valid and shared by each attempt
    shared_args = {
        "tool": "test",
        "cutoff": 17,
        "neighbourhood_range": 5,
        "detection_rule": "some rule text",
    }
    invalid_products = ("-", "-like", "NRPS-", "NRPS PKS", "NRPS/PKS", "NRPS,PKS", "NRPS.PKS")
    for invalid in invalid_products:
        with self.assertRaisesRegex(ValueError, "invalid protocluster product"):
            Protocluster(location, location, product=invalid, **shared_args)
def test_creation_hybrid(self):
    """ Clusters sharing a CDS that carries both of their products should be
        merged into a single chemical hybrid supercluster
    """
    first = create_cluster(3, 8, 71, 76, 'a')
    second = create_cluster(50, 60, 120, 170, 'b')

    # insert the cds that will cause the hybrid call
    shared_cds = create_cds(60, 65, ["a", "b"])
    for member in (first, second):
        member.add_cds(shared_cds)

    created = creator([first, second])
    print(created)
    assert len(created) == 1

    only = created[0]
    assert only.kind == SuperCluster.kinds.CHEMICAL_HYBRID
    assert only.location == FeatureLocation(3, 170)
def test_probabilities(self):
    """ A region's probabilities should list only the subregion probabilities
        that were actually set, in order
    """
    location = FeatureLocation(0, 10)
    supers = [SuperCluster(SuperCluster.kinds.SINGLE, [create_cluster(0, 10)])]
    # with no subregions at all, there is nothing to report
    assert Region(superclusters=supers).probabilities == []

    # each step adds one more subregion to the same growing list
    steps = (
        (None, []),
        (0.1, [0.1]),
        (0.7, [0.1, 0.7]),
    )
    subs = []
    for probability, expected in steps:
        subs.append(SubRegion(location, "testtool", probability=probability))
        region = Region(superclusters=supers, subregions=subs)
        assert region.probabilities == expected
def test_limited_add_cds_propagation(self):
    """ A CDS added to a region should reach the supercluster and cluster,
        but not the subregion at 20..30
    """
    cds = DummyCDS(0, 10)
    self.sub = SubRegion(FeatureLocation(20, 30), "testtool")
    self.region = Region(superclusters=[self.super], subregions=[self.sub])

    # ensure all empty to start with
    for container in (self.cluster, self.super, self.sub, self.region):
        assert not container.cds_children
    assert not cds.region

    self.region.add_cds(cds)

    only_this_cds = (cds, )
    assert self.cluster.cds_children == only_this_cds
    assert self.super.cds_children == only_this_cds
    # the CDS at 0..10 must not have been handed to the subregion
    assert not self.sub.cds_children
    assert self.region.cds_children == only_this_cds
    assert cds.region is self.region
def find_clusters(record: Record, cds_by_cluster_type: Dict[str, Set[str]],
                  rules_by_name: Dict[str, rule_parser.DetectionRule]) -> List[Cluster]:
    """ Detects gene clusters based on the identified core genes

        For each cluster type, the core CDS features are walked in sorted
        order and merged into one core location for as long as each next CDS
        overlaps the current core extended by the rule's cutoff. Each finished
        core becomes a Cluster surrounded by the rule's extent, limited to the
        bounds of the record.

        NOTE(review): the surrounds of every cluster except the last of each
        type are tightened to the outermost CDS features fully contained
        within them, the last cluster of each type keeps the raw extent.
        Confirm whether that difference is intended.

        Arguments:
            record: the record containing the CDS features
            cds_by_cluster_type: a dictionary mapping cluster type to
                                 the names of the core CDS features of that type
            rules_by_name: a dictionary mapping cluster type to its detection rule

        Returns:
            a list of Clusters, after redundant clusters have been removed
    """
    def neighbourhood_of(core: FeatureLocation, extent: int) -> FeatureLocation:
        # the core extended by the given extent, without leaving the record
        return FeatureLocation(max(0, core.start - extent),
                               min(core.end + extent, len(record)))

    def build_cluster(core: FeatureLocation, surrounds: FeatureLocation,
                      rule: rule_parser.DetectionRule, product: str) -> Cluster:
        return Cluster(core, surrounding_location=surrounds,
                       tool="rule-based-clusters", cutoff=rule.cutoff,
                       neighbourhood_range=rule.extent, product=product,
                       detection_rule=str(rule.conditions))

    clusters = []  # type: List[Cluster]

    cds_feature_by_name = record.get_cds_name_mapping()

    for cluster_type, cds_names in cds_by_cluster_type.items():
        core_features = sorted([cds_feature_by_name[cds] for cds in cds_names])
        rule = rules_by_name[cluster_type]
        cutoff = rule.cutoff
        core = core_features[0].location
        for cds in core_features[1:]:
            reach = FeatureLocation(core.start - cutoff, core.end + cutoff)
            if not cds.overlaps_with(reach):
                # create the previous cluster and start a new location
                surrounds = neighbourhood_of(core, rule.extent)
                enclosed = record.get_cds_features_within_location(surrounds, with_overlapping=False)
                real_start = min(contained.location.start for contained in enclosed)
                real_end = max(contained.location.end for contained in enclosed)
                clusters.append(build_cluster(core, FeatureLocation(real_start, real_end),
                                              rule, cluster_type))
                core = cds.location
                continue
            # still within reach of the current core, so grow it to include this CDS
            core = FeatureLocation(min(cds.location.start, core.start),
                                   max(cds.location.end, core.end))
            assert core.start >= 0 and core.end <= len(record)

        # finalise the last cluster
        clusters.append(build_cluster(core, neighbourhood_of(core, rule.extent),
                                      rule, cluster_type))

    # fit to record if outside
    for cluster in clusters:
        current = cluster.location
        fitted = FeatureLocation(max(0, current.start), min(current.end, len(record)))
        if fitted != cluster.location:
            cluster.location = fitted

    clusters = remove_redundant_clusters(clusters, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(clusters))
    return clusters