def run_detection(record: Record, options: ConfigType,
                  module_results: Dict[str, Union[ModuleResults, Dict[str, Any]]]) -> Dict[str, float]:
    """ Detect different secondary metabolite clusters, PFAMs, and domains.

        Runs in three phases:
            1. whole-genome detection modules (e.g. full_hmmer)
            2. cluster prediction modules, whose protoclusters/subregions are
               added to the record before candidate clusters and regions are built
            3. modules restricted to genes within the detected clusters

        If no regions are found after phase 2, the record is marked as skipped
        and phase 3 is not run.

        Arguments:
            record: the Record to run detection over
            options: antiSMASH config
            module_results: a dictionary mapping a module's name to results from
                            a previous run on this module, as a ModuleResults
                            subclass or in JSON form

        Returns:
            the time taken by each detection module as a dictionary
    """
    timings: Dict[str, float] = {}

    # run full genome detections
    for module in [full_hmmer]:
        _run_module_and_add_results(record, module, options, module_results, timings)

    # generate cluster predictions
    logging.info("Detecting secondary metabolite clusters")
    for module in [hmm_detection, cassis, clusterfinder_probabilistic]:
        run_module(record, cast(AntismashModule, module), options, module_results, timings)
        results = module_results.get(module.__name__)
        if results:
            # cluster prediction modules must provide protocluster/subregion accessors
            assert isinstance(results, DetectionResults)
            for protocluster in results.get_predicted_protoclusters():
                record.add_protocluster(protocluster)
            for region in results.get_predicted_subregions():
                record.add_subregion(region)

    logging.debug("%d protoclusters found", len(record.get_protoclusters()))
    logging.debug("%d subregions found", len(record.get_subregions()))

    record.create_candidate_clusters()
    record.create_regions()

    # nothing to analyse further without regions, so mark the record and stop early
    if not record.get_regions():
        logging.info("No regions detected, skipping record")
        record.skip = "No regions detected"
        return timings

    logging.info("%d region(s) detected in record", len(record.get_regions()))

    # finally, run any detection limited to genes in clusters
    for module in [nrps_pks_domains, cluster_hmmer, genefunctions]:
        _run_module_and_add_results(record, module, options, module_results, timings)

    return timings


def _run_module_and_add_results(record: Record, module: Any, options: ConfigType,
                                module_results: Dict[str, Union[ModuleResults, Dict[str, Any]]],
                                timings: Dict[str, float]) -> None:
    """ Run a single detection module and add any results it produced to the record.

        Shared by the whole-genome and cluster-limited phases of run_detection,
        which previously duplicated this logic.

        Arguments:
            record: the Record being analysed
            module: the detection module to run
            options: antiSMASH config
            module_results: mapping of module name to prior results, updated in place
            timings: mapping of module name to runtime, updated in place

        Returns:
            None
    """
    run_module(record, cast(AntismashModule, module), options, module_results, timings)
    results = module_results.get(module.__name__)
    if results:
        assert isinstance(results, ModuleResults)
        logging.debug("Adding detection results from %s to record", module.__name__)
        results.add_to_record(record)
def test_genbank(self):
    """ Round-trip a region through genbank output and check the regenerated
        region matches the original (with coordinates shifted to be
        relative to the region start).
    """
    record = Record(Seq("A" * 100, generic_dna))
    # build two protoclusters, a subregion, and a candidate covering the clusters
    protoclusters = [create_protocluster(3, 20, "prodA"),
                     create_protocluster(25, 41, "prodB")]
    for protocluster in protoclusters:
        record.add_protocluster(protocluster)
    subregion = SubRegion(FeatureLocation(35, 71), "test", 0.7)
    record.add_subregion(subregion)
    candidate = CandidateCluster(CandidateCluster.kinds.NEIGHBOURING, protoclusters)
    record.add_candidate_cluster(candidate)
    region = Region(candidate_clusters=[candidate], subregions=[subregion])
    record.add_region(region)

    # write the region out and parse it back in
    with NamedTemporaryFile(suffix=".gbk") as output:
        region.write_to_genbank(output.name)
        parsed = list(seqio.parse(output.name))
    assert len(parsed) == 1
    print(parsed[0].features)
    regenerated_record = Record.from_biopython(parsed[0], taxon="bacteria")

    assert len(regenerated_record.get_regions()) == 1
    regenerated = regenerated_record.get_region(0)
    # coordinates in the written file are relative to the region's own start
    assert regenerated.location.start == 3 - region.location.start
    assert regenerated.location.end == 71 - region.location.start
    assert regenerated.products == region.products
    assert regenerated.probabilities == region.probabilities
class HmmDetectionTest(unittest.TestCase):
    """ Tests for rule-based HMM cluster detection: rule application,
        protocluster formation, and HSP filtering helpers.
    """
    def setUp(self):
        self.config = build_config([])
        self.rules_file = path.get_full_path(__file__, "..", "cluster_rules", "strict.txt")
        self.signature_file = path.get_full_path(__file__, "..", "data", "hmmdetails.txt")
        self.signature_names = {sig.name for sig in core.get_signature_profiles()}
        self.filter_file = path.get_full_path(__file__, "..", "filterhmmdetails.txt")
        # fake profile hits keyed by gene name
        self.results_by_id = {
            "GENE_1": [
                FakeHSPHit("modelA", "GENE_1", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_1", 0, 10, 50, 0)
            ],
            "GENE_2": [
                FakeHSPHit("modelC", "GENE_2", 0, 10, 50, 0),
                FakeHSPHit("modelB", "GENE_2", 0, 10, 50, 0)
            ],
            "GENE_3": [
                FakeHSPHit("modelC", "GENE_3", 0, 10, 50, 0),
                FakeHSPHit("modelF", "GENE_3", 0, 10, 50, 0)
            ],
            "GENE_4": [
                FakeHSPHit("modelA", "GENE_4", 0, 10, 50, 0),
                FakeHSPHit("modelE", "GENE_4", 0, 10, 50, 0)
            ],
            "GENE_5": [
                FakeHSPHit("modelA", "GENE_5", 0, 10, 50, 0),
                FakeHSPHit("modelG", "GENE_5", 0, 10, 50, 0)
            ]
        }
        self.feature_by_id = {
            "GENE_1": DummyCDS(0, 30000, locus_tag="GENE_1"),
            "GENE_2": DummyCDS(30000, 50000, locus_tag="GENE_2"),
            "GENE_3": DummyCDS(70000, 90000, locus_tag="GENE_3"),
            "GENE_X": DummyCDS(95000, 100000, locus_tag="GENE_X"),  # no hits
            "GENE_4": DummyCDS(125000, 140000, locus_tag="GENE_4"),
            "GENE_5": DummyCDS(130000, 150000, locus_tag="GENE_5")
        }
        self.test_names = {"modelA", "modelB", "modelC", "modelF", "modelG",
                           "a", "b", "c", "d"}
        self.rules = rule_parser.Parser("\n".join([
            "RULE MetaboliteA CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS modelA",
            "RULE MetaboliteB CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS cds(modelA and modelB)",
            "RULE MetaboliteC CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS (modelA and modelB)",
            "RULE MetaboliteD CUTOFF 20 NEIGHBOURHOOD 5 CONDITIONS minimum(2,[modelC,modelB]) and modelA",
            "RULE Metabolite0 CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelF",
            "RULE Metabolite1 CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelG"
        ]), self.test_names).rules
        self.features = list(self.feature_by_id.values())
        self.features.sort(key=lambda x: x.location.start)  # vital for py3 < 3.5
        self.record = Record()
        self.record._record.seq = Seq("A" * 150000)
        for feature in self.features:
            self.record.add_cds_feature(feature)

    def tearDown(self):
        # clear out any leftover config adjustments
        destroy_config()

    def test_overlaps_but_not_contains(self):
        # should get gene2 and gene3
        # fix: the two rule strings were missing a separating comma, so implicit
        # string concatenation merged them into a single malformed rule line
        rules = rule_parser.Parser("\n".join([
            "RULE Overlap CUTOFF 25 NEIGHBOURHOOD 5 CONDITIONS modelB and modelF ",
            "RULE OverlapImpossible CUTOFF 25 NEIGHBOURHOOD 5 CONDITIONS modelA and modelF"
        ]), self.test_names).rules
        detected_types, cluster_type_hits = hmm_detection.apply_cluster_rules(
            self.record, self.results_by_id, rules)
        assert detected_types == {"GENE_2": {"Overlap": {"modelB"}},
                                  "GENE_3": {"Overlap": {"modelF"}}}
        assert cluster_type_hits == {"Overlap": {"GENE_2", "GENE_3"}}
        # only 1 cluster should be found, since it requires both genes
        # if forming clusters by .is_contained_by(), 2 clusters will be formed
        # if finding rule hits uses .is_contained_by(), no clusters will be formed
        rules_by_name = {rule.name: rule for rule in rules}
        clusters = hmm_detection.find_protoclusters(self.record, cluster_type_hits, rules_by_name)
        assert len(clusters) == 1
        assert clusters[0].product == "Overlap"
        assert clusters[0].core_location.start == 30000
        assert clusters[0].core_location.end == 90000

    def test_core(self):
        # should be no failing prerequisites
        assert core.check_prereqs(self.config) == []
        # always runs
        assert core.is_enabled(None)

    def test_apply_cluster_rules(self):
        detected_types, cluster_type_hits = hmm_detection.apply_cluster_rules(
            self.record, self.results_by_id, self.rules)
        for gid in detected_types:
            detected_types[gid] = set(detected_types[gid])
        expected_types = {
            "GENE_1": set(["MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"]),
            "GENE_2": set(["MetaboliteC", "MetaboliteD"]),
            "GENE_3": set(["Metabolite0"]),
            "GENE_4": set(["MetaboliteA"]),
            "GENE_5": set(["Metabolite1", "MetaboliteA"])
        }
        assert detected_types == expected_types
        assert cluster_type_hits == {"MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
                                     "MetaboliteB": {"GENE_1"},
                                     "MetaboliteC": {"GENE_1", "GENE_2"},
                                     'MetaboliteD': {'GENE_1', 'GENE_2'},
                                     'Metabolite0': {'GENE_3'},
                                     'Metabolite1': {'GENE_5'}}

    def test_find_protoclusters(self):
        cds_features_by_type = {"MetaboliteA": {"GENE_1", "GENE_4", "GENE_5"},
                                "MetaboliteB": {"GENE_1"},
                                "MetaboliteC": {"GENE_1", "GENE_2"},
                                'MetaboliteD': {'GENE_1', 'GENE_2'},
                                'Metabolite0': {'GENE_3'},
                                'Metabolite1': {'GENE_5'}}
        rules = {rule.name: rule for rule in self.rules}
        for cluster in hmm_detection.find_protoclusters(self.record, cds_features_by_type, rules):
            self.record.add_protocluster(cluster)
        assert len(self.record.get_protoclusters()) == 7
        cluster_products = sorted([cluster.product for cluster in self.record.get_protoclusters()])
        assert cluster_products == sorted(["Metabolite%s" % i for i in "01AABCD"])
        self.record.create_candidate_clusters()
        assert len(self.record.get_candidate_clusters()) == 3
        self.record.create_regions()
        assert len(self.record.get_regions()) == 3
        result_regions = []
        for region in self.record.get_regions():
            result_regions.append(sorted(cds.get_name() for cds in region.cds_children))
        expected_regions = [["GENE_1", "GENE_2"], ["GENE_3"], ["GENE_4", "GENE_5"]]
        assert result_regions == expected_regions

    def test_create_rules(self):
        rules = hmm_detection.create_rules(self.rules_file, self.signature_names)
        # fix: close the rules file handle instead of leaking it
        with open(self.rules_file) as handle:
            assert len(rules) == handle.read().count("\nRULE")
        t1pks_rules = [rule for rule in rules if rule.name == "T1PKS"]
        assert len(t1pks_rules) == 1
        rule = t1pks_rules[0]
        assert rule.cutoff == 20000
        assert rule.neighbourhood == 20000

    def test_profiles_parsing(self):
        profiles = signatures.get_signature_profiles()
        assert len(profiles) == 250  # ensures we don't delete any by accident

    def test_filter(self):
        # fake HSPs all in one CDS with overlap > 20 and query_ids from the same equivalence group
        # not overlapping by > 20
        first = FakeHSPHit("AMP-binding", "A", 50, 90, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first, second]},
                                                  self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # overlapping, in same group
        first.hit_end = 91
        assert hmm_detection.hsp_overlap_size(first, second) == 21
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first, second]},
                                                  self.filter_file, self.signature_names)
        assert new == [second]
        assert by_id == {"A": [second]}

        # overlapping, not in same group
        second.query_id = "none"
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first, second]},
                                                  self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not in the same CDS, but in the same group
        second.hit_id = "B"
        second.query_id = "A-OX"
        new, by_id = hmm_detection.filter_results([first, second], {"A": [first], "B": [second]},
                                                  self.filter_file, self.signature_names)
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_filter_multiple(self):
        # all in one CDS no overlap and the same query_ids -> cull all but the best score
        # not overlapping, not same query_id
        first = FakeHSPHit("AMP-binding", "A", 50, 60, 0.1, None)
        second = FakeHSPHit("A-OX", "A", 70, 100, 0.5, None)
        both = [first, second]
        by_id = {"A": [first, second]}
        new, by_id = hmm_detection.filter_result_multiple(list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first, second]}

        # not overlapping, same query_id
        first.query_id = "A-OX"
        new, by_id = hmm_detection.filter_result_multiple(list(both), dict(by_id))
        assert new == [second]
        assert by_id == {"A": [second]}

        # not in same CDS, same query_id
        second.hit_id = "B"
        by_id = {"A": [first], "B": [second]}
        new, by_id = hmm_detection.filter_result_multiple(list(both), dict(by_id))
        assert new == [first, second]
        assert by_id == {"A": [first], "B": [second]}

    def test_equivalence_groups(self):
        group_file = path.get_full_path(os.path.dirname(__file__), "filterhmmdetails.txt")
        sets = []
        with open(group_file) as group_lines:
            sets = [set(line.strip().split(',')) for line in group_lines]
        # ensure they have at least two elements
        assert all(len(s) > 1 for s in sets)
        # ensure that the groups are disjoint
        for i, group in enumerate(sets):
            for other in sets[i + 1:]:
                assert group.isdisjoint(other)

    def test_hsp_overlap_size(self):
        overlap_size = hmm_detection.hsp_overlap_size
        first = FakeHSPHit("A", "A", 50, 60, 0., None)
        second = FakeHSPHit("B", "B", 70, 100, 0., None)
        # no overlap
        assert overlap_size(first, second) == 0
        first.hit_end = 70
        # still no overlap, end isn't inclusive
        assert overlap_size(first, second) == 0
        # a mix of second starting inside first
        for i in range(1, 30):
            first.hit_end += 1
            assert overlap_size(first, second) == i
        # second wholly contained
        first.hit_end = 110
        assert overlap_size(first, second) == 30
        # first inside second
        first.hit_start = 75
        assert overlap_size(first, second) == 25
        # first inside second, but direction reversed
        first.hit_end = 50
        with self.assertRaises(AssertionError):
            overlap_size(first, second)

    def test_hmm_files_and_details_match(self):
        data_dir = path.get_full_path(os.path.dirname(__file__), "data", "")
        details_files = {prof.path for prof in signatures.get_signature_profiles()}
        details_files = {filepath.replace(data_dir, "") for filepath in details_files}
        data_dir_contents = set(glob.glob(data_dir + "*.hmm"))
        data_dir_contents = {filepath.replace(data_dir, "") for filepath in data_dir_contents}
        # ignore bgc_seeds.hmm for the sake of comparison, it's a generated aggregate
        data_dir_contents.discard("bgc_seeds.hmm")
        missing_files = details_files - data_dir_contents
        assert not missing_files
        extra_files = data_dir_contents - details_files
        assert not extra_files
        # finally, just to be sure
        assert data_dir_contents == details_files
class TestCandidateCluster(unittest.TestCase):
    # Tests CandidateCluster construction, record linkage, and conversion
    # to and from biopython features (with a record supplied directly).
    def setUp(self):
        # a 100-base record with a single zero-width-neighbourhood protocluster
        self.record = Record(Seq("A" * 100))
        clusters = [create_cluster(0, 0, 10, 10)]
        for cluster in clusters:
            self.record.add_protocluster(cluster)

    def test_kinds_attachment(self):
        # the kind enum must be reachable via the class itself
        assert CandidateCluster.kinds == CandidateClusterKind

    def test_record_linkage(self):
        cluster = CandidateCluster(CandidateCluster.kinds.INTERLEAVED,
                                   self.record.get_protoclusters())
        # numbering requires the candidate to have been added to a record first
        with self.assertRaisesRegex(ValueError, "CandidateCluster not contained in record"):
            cluster.get_candidate_cluster_number()
        self.record.add_candidate_cluster(cluster)
        assert cluster.get_candidate_cluster_number() == 1

    def test_bad_kind(self):
        # kind must be a CandidateClusterKind, not an arbitrary string
        with self.assertRaisesRegex(TypeError, "should be CandidateClusterKind"):
            CandidateCluster("berf", self.record.get_protoclusters())

    def test_no_clusters(self):
        # a candidate cluster cannot be empty
        with self.assertRaisesRegex(ValueError, "cannot exist without at least one"):
            CandidateCluster(CandidateCluster.kinds.INTERLEAVED, [])

    def test_rules(self):
        cluster = CandidateCluster(CandidateCluster.kinds.INTERLEAVED,
                                   self.record.get_protoclusters())
        # NOTE(review): the comprehension variable shadows the outer `cluster`;
        # harmless in py3 but confusing to read
        assert cluster.detection_rules == [cluster.detection_rule
                                           for cluster in self.record.get_protoclusters()]

    def test_smiles_and_polymer(self):
        # both default to None when not supplied at construction
        cluster = CandidateCluster(CandidateCluster.kinds.INTERLEAVED,
                                   self.record.get_protoclusters())
        assert cluster.smiles_structure is None
        assert cluster.polymer is None

    def test_conversion(self):
        # round-trip: CandidateCluster -> biopython feature -> CandidateCluster
        kind = CandidateClusterKind.INTERLEAVED
        original = CandidateCluster(kind, self.record.get_protoclusters(),
                                    smiles="dummy smiles", polymer="dummy polymer")
        self.record.add_candidate_cluster(original)
        assert original.products == ["a"]
        assert len(original.protoclusters) == 1
        bios = original.to_biopython()
        assert len(bios) == 1
        bio = bios[0]
        # all relevant attributes must survive as feature qualifiers
        assert bio.qualifiers["product"] == ["a"]
        assert bio.qualifiers["kind"] == [str(kind)]
        assert bio.qualifiers["candidate_cluster_number"] == [str(original.get_candidate_cluster_number())]
        assert bio.qualifiers["SMILES"] == ["dummy smiles"]
        assert bio.qualifiers["polymer"] == ["dummy polymer"]
        assert bio.qualifiers["contig_edge"] == ["True"]
        real = CandidateCluster.from_biopython(bio, record=self.record)
        assert isinstance(real, CandidateCluster)
        # the regenerated candidate must reference the record's own protoclusters
        assert len(real.protoclusters) == len(self.record.get_protoclusters())
        for reference, record_cluster in zip(real.protoclusters,
                                             self.record.get_protoclusters()):
            assert reference is record_cluster
        # attempt a conversion with a record missing the cluster
        self.record.clear_protoclusters()
        with self.assertRaisesRegex(ValueError, "record does not contain all expected protoclusters"):
            CandidateCluster.from_biopython(bio, record=self.record)
        # and with no record
        with self.assertRaisesRegex(ValueError, "record instance required"):
            CandidateCluster.from_biopython(bio)

    def test_core(self):
        # core_location spans from the first core start to the last core end
        protos = [create_cluster(5, 10, 20, 25, "a"),
                  create_cluster(30, 40, 50, 60, "b")]
        cluster = CandidateCluster(CandidateClusterKind.NEIGHBOURING, protos,
                                   smiles="dummy", polymer="dummy")
        assert cluster.core_location == FeatureLocation(10, 50)

    def test_comparison(self):
        # ordering: earlier start sorts first, longer sorts before shorter at same start
        candidate = CandidateCluster(CandidateClusterKind.NEIGHBOURING,
                                     [create_cluster(5, 10, 20, 25, "a")])
        longer = CandidateCluster(CandidateClusterKind.NEIGHBOURING,
                                  [create_cluster(5, 10, 40, 45, "a")])
        after = CandidateCluster(CandidateClusterKind.NEIGHBOURING,
                                 [create_cluster(10, 20, 40, 45, "a")])

        def check(first, second):
            # comparisons must work against both candidates and raw locations
            assert first < second
            assert first < second.location
            assert sorted([second, first]) == [first, second]

        check(candidate, after)
        check(longer, candidate)
        check(longer, after)
        assert sorted([after, candidate, longer]) == [longer, candidate, after]
class TestCandidateCluster(unittest.TestCase):
    # NOTE(review): this redefines TestCandidateCluster, shadowing an earlier
    # class of the same name in this file — the earlier version's extra tests
    # (including record-based from_biopython conversion) will never run.
    # Looks like two versions of the suite were merged; confirm which
    # test_conversion variant (direct vs TemporaryCandidateCluster) is intended.
    def setUp(self):
        # a 100-base record with a single zero-width-neighbourhood protocluster
        self.record = Record(Seq("A" * 100))
        clusters = [create_cluster(0, 0, 10, 10)]
        for cluster in clusters:
            self.record.add_protocluster(cluster)

    def test_kinds_attachment(self):
        # the kind enum must be reachable via the class itself
        assert CandidateCluster.kinds == CandidateClusterKind

    def test_record_linkage(self):
        cluster = CandidateCluster(CandidateCluster.kinds.INTERLEAVED,
                                   self.record.get_protoclusters())
        # numbering requires the candidate to have been added to a record first
        with self.assertRaisesRegex(ValueError, "CandidateCluster not contained in record"):
            cluster.get_candidate_cluster_number()
        self.record.add_candidate_cluster(cluster)
        assert cluster.get_candidate_cluster_number() == 1

    def test_bad_kind(self):
        # kind must be a CandidateClusterKind, not an arbitrary string
        with self.assertRaisesRegex(TypeError, "should be CandidateClusterKind"):
            CandidateCluster("berf", self.record.get_protoclusters())

    def test_no_clusters(self):
        # a candidate cluster cannot be empty
        with self.assertRaisesRegex(ValueError, "cannot exist without at least one"):
            CandidateCluster(CandidateCluster.kinds.INTERLEAVED, [])

    def test_rules(self):
        cluster = CandidateCluster(CandidateCluster.kinds.INTERLEAVED,
                                   self.record.get_protoclusters())
        # NOTE(review): the comprehension variable shadows the outer `cluster`;
        # harmless in py3 but confusing to read
        assert cluster.detection_rules == [cluster.detection_rule
                                           for cluster in self.record.get_protoclusters()]

    def test_smiles_and_polymer(self):
        # both default to None when not supplied at construction
        cluster = CandidateCluster(CandidateCluster.kinds.INTERLEAVED,
                                   self.record.get_protoclusters())
        assert cluster.smiles_structure is None
        assert cluster.polymer is None

    def test_conversion(self):
        # round-trip: CandidateCluster -> biopython feature -> temporary
        # placeholder -> real CandidateCluster resolved against the record
        kind = CandidateClusterKind.INTERLEAVED
        original = CandidateCluster(kind, self.record.get_protoclusters(),
                                    smiles="dummy smiles", polymer="dummy polymer")
        self.record.add_candidate_cluster(original)
        assert original.products == ["a"]
        assert len(original.protoclusters) == 1
        bios = original.to_biopython()
        assert len(bios) == 1
        bio = bios[0]
        # all relevant attributes must survive as feature qualifiers
        assert bio.qualifiers["product"] == ["a"]
        assert bio.qualifiers["kind"] == [str(kind)]
        assert bio.qualifiers["candidate_cluster_number"] == [str(original.get_candidate_cluster_number())]
        assert bio.qualifiers["SMILES"] == ["dummy smiles"]
        assert bio.qualifiers["polymer"] == ["dummy polymer"]
        assert bio.qualifiers["contig_edge"] == ["True"]
        # without a record, from_biopython yields a temporary placeholder that
        # stores protocluster numbers rather than the protoclusters themselves
        regenerated = CandidateCluster.from_biopython(bio)
        assert isinstance(regenerated, TemporaryCandidateCluster)
        assert regenerated.products == original.products
        assert regenerated.location == original.location
        assert regenerated.smiles_structure == original.smiles_structure
        assert regenerated.polymer == original.polymer
        proto_numbers = [cluster.get_protocluster_number()
                         for cluster in self.record.get_protoclusters()]
        assert regenerated.protoclusters == proto_numbers
        assert regenerated.kind == original.kind
        # resolving against the record gives back a real CandidateCluster
        real = regenerated.convert_to_real_feature(self.record)
        assert isinstance(real, CandidateCluster)
        assert len(real.protoclusters) == len(self.record.get_protoclusters())
        for reference, record_cluster in zip(real.protoclusters,
                                             self.record.get_protoclusters()):
            assert reference is record_cluster
        # attempt a conversion with a record missing the cluster
        self.record.clear_protoclusters()
        with self.assertRaisesRegex(ValueError, "Not all referenced clusters are present"):
            regenerated.convert_to_real_feature(self.record)