def setUp(self):
    """Create a clusterfinder config and a 2000-base record.

    Every fixture row adds one CDS feature and one PFAM domain sharing the
    same location to the record.
    """
    cli_args = [
        "--cf-create-clusters",
        "--cf-mean-threshold", "0.6",
        "--cf-min-cds", "5",
        "--cf-min-pfams", "5",
    ]
    self.config = build_config(cli_args, modules=[clusterfinder], isolated=True)
    update_config({"enabled_cluster_types": []})

    self.record = DummyRecord(seq=Seq("A" * 2000))

    # each row is (start, end, probability, pfam identifier)
    fixture_rows = [
        (10, 20, 0.1, 'PF77777'),
        (30, 40, 0.3, 'PF00106'),
        (50, 60, 0.4, 'PF00107'),
        (60, 70, 0.7, 'PF00109'),
        (70, 80, 0.98, 'PF08484'),
        (90, 100, 0.8, 'PF02401'),
        (100, 110, 0.32, 'PF04369'),
        (110, 120, 1.0, 'PF00128'),
        (130, 140, 0.2, 'PF77776'),
        (500, 505, None, 'PF77775'),
        (1010, 1020, 0.1, 'PF77774'),
        (1030, 1040, 0.3, 'PF00106'),
        (1050, 1060, 0.4, 'PF00107'),
        (1060, 1070, 0.7, 'PF00109'),
        (1070, 1080, 0.98, 'PF08484'),
        (1090, 1100, 0.8, 'PF02401'),
        (1100, 1110, 0.32, 'PF04369'),
        (1110, 1120, 1.0, 'PF00128'),
    ]
    for begin, stop, prob, identifier in fixture_rows:
        span = FeatureLocation(begin, stop, strand=1)
        cds = CDSFeature(span, locus_tag=str(begin), translation="A")
        self.record.add_cds_feature(cds)

        domain = PFAMDomain(span, "dummy_description",
                            protein_start=begin + 1, protein_end=stop - 1,
                            identifier=identifier, tool="test")
        domain.domain_id = "pfam_%d" % begin
        domain.probability = prob
        self.record.add_pfam_domain(domain)
def test_results_reconstruction(self):
    """Check that results survive a to_json/from_json round trip unchanged."""
    def verify(candidate):
        # the same expectations apply before and after the round trip
        assert candidate.record_id == "rec_id"
        assert candidate.tool == "toolname"
        assert isinstance(candidate.best_hits["cds1"], HMMResult)
        assert candidate.best_hits["cds1"].hit_id == 'desc1'
        assert candidate.best_hits["cds2"].bitscore == 20
        assert candidate.function_mapping["cds2"] == GeneFunction.REGULATORY

    best = {
        "cds1": HMMResult("desc1", 0, 100, 2.3e-126, 416),
        "cds2": HMMResult("desc2", 5, 60, 3e-16, 20),
    }
    functions = {
        "cds1": GeneFunction.TRANSPORT,
        "cds2": GeneFunction.REGULATORY,
    }
    initial = self.res_class("rec_id", "toolname", best_hits=best,
                             function_mapping=functions)
    verify(initial)

    serialised = initial.to_json()
    assert serialised["best_hits"]["cds1"][0] == best["cds1"].hit_id

    target = DummyRecord()
    target.id = "rec_id"
    rebuilt = self.res_class.from_json(serialised, target)
    verify(rebuilt)
def setUp(self):
    """Create a clusterfinder config and a 2000-base record.

    Every fixture row adds one CDS feature and one PFAM domain at the same
    location; the PFAM identifier is stored in the domain's db_xref list.
    """
    cli_args = [
        "--cf-create-clusters",
        "--cf-mean-threshold", "0.6",
        "--cf-min-cds", "5",
        "--cf-min-pfams", "5",
    ]
    self.config = build_config(cli_args, modules=[clusterfinder], isolated=True)
    update_config({"enabled_cluster_types": []})

    self.record = DummyRecord(seq=Seq("A" * 2000))

    # each row is (start, end, probability, pfam identifier)
    fixture_rows = [
        (10, 20, 0.1, 'FAKE007'),
        (30, 40, 0.3, 'PF00106'),
        (50, 60, 0.4, 'PF00107'),
        (60, 70, 0.7, 'PF00109'),
        (70, 80, 0.98, 'PF08484'),
        (90, 100, 0.8, 'PF02401'),
        (100, 110, 0.32, 'PF04369'),
        (110, 120, 1.0, 'PF00128'),
        (130, 140, 0.2, 'FAKE234'),
        (500, 505, None, 'FAKE505'),
        (1010, 1020, 0.1, 'FAKE007'),
        (1030, 1040, 0.3, 'PF00106'),
        (1050, 1060, 0.4, 'PF00107'),
        (1060, 1070, 0.7, 'PF00109'),
        (1070, 1080, 0.98, 'PF08484'),
        (1090, 1100, 0.8, 'PF02401'),
        (1100, 1110, 0.32, 'PF04369'),
        (1110, 1120, 1.0, 'PF00128'),
    ]
    for begin, stop, prob, identifier in fixture_rows:
        span = FeatureLocation(begin, stop)
        cds = CDSFeature(span, locus_tag=str(begin))
        self.record.add_cds_feature(cds)

        domain = PFAMDomain(span, "dummy_description")
        domain.db_xref.append(identifier)
        domain.probability = prob
        self.record.add_pfam_domain(domain)
def test_antismash_comment(self):
    """Check the antiSMASH comment block for each combination of options.

    Covers the default case, a custom start, a custom end, and a record
    with an original id set.
    """
    record = DummyRecord()
    opts = Namespace()
    opts.start = -1
    opts.end = -1
    opts.version = "5.dummy"
    bio = record.to_biopython()

    # defaults: only the markers and the version are expected
    main.add_antismash_comments([(record, bio)], opts)
    comment = bio.annotations["comment"]
    assert "##antiSMASH-Data-START##" in comment
    assert "##antiSMASH-Data-END##" in comment
    assert "Version" in comment
    assert opts.version in comment
    for absent in ("Original ID", "Starting at", "Ending at"):
        assert absent not in comment

    # a custom start position
    bio.annotations["comment"] = ""
    opts.start = 7
    main.add_antismash_comments([(record, bio)], opts)
    comment = bio.annotations["comment"]
    assert "Original ID" not in comment
    assert "Starting at :: 7\n" in comment

    # a custom end position
    bio.annotations["comment"] = ""
    opts.start = -1
    opts.end = 1000
    main.add_antismash_comments([(record, bio)], opts)
    comment = bio.annotations["comment"]
    assert "Original ID" not in comment
    assert "Ending at :: 1000\n" in comment

    # a record carrying an original id
    bio.annotations["comment"] = ""
    opts.end = -1
    record.original_id = "something else"
    main.add_antismash_comments([(record, bio)], opts)
    comment = bio.annotations["comment"]
    assert "Original ID" in comment
    assert "something else" in comment
def test_add_to_incorrect_record(self):
    """Check that adding results to a record with a different id raises ValueError."""
    results = self.create_results(record_id=self.record.id)
    # build the mismatched record outside the context manager, so that only
    # the call under test is able to satisfy the expected exception
    other = DummyRecord()
    other.id = self.record.id * 2
    with self.assertRaisesRegex(
            ValueError, "Record to store in and record analysed don't match"):
        results.add_to_record(other)
def test_add_to_record(self):
    """Check that adding results to an empty record leaves it with two features, both domains of TOOL."""
    target = DummyRecord()
    outcome = self.create_results(record_id=target.id)
    # the record has to start with no features at all
    assert not target.get_all_features()

    outcome.add_to_record(target)

    feature_count = len(target.get_all_features())
    assert feature_count == 2
    domain_count = len(target.get_antismash_domains_by_tool(TOOL))
    assert domain_count == 2
def test_complex(self):
    """Check gathered areas for a record with two regions.

    The first region holds two candidate clusters, the second holds only a
    subregion.
    """
    proto_a = helpers.DummyProtocluster(core_start=3, core_end=22,
                                        neighbourhood_range=1, product='a')
    proto_b = helpers.DummyProtocluster(core_start=33, core_end=48,
                                        neighbourhood_range=2, product='b')
    protos = [proto_a, proto_b]
    single = helpers.DummyCandidateCluster(clusters=protos[:1])
    interleaved = helpers.DummyCandidateCluster(
        clusters=protos,
        kind=helpers.DummyCandidateCluster.kinds.INTERLEAVED,
    )
    candidates = [single, interleaved]
    subregion = helpers.DummySubRegion(start=55, end=90)
    regions = [
        helpers.DummyRegion(candidate_clusters=candidates, subregions=[]),
        helpers.DummyRegion(candidate_clusters=[], subregions=[subregion]),
    ]
    all_features = regions + [subregion] + candidates + protos
    record = DummyRecord(record_id="test", seq="A" * 90, features=all_features)

    areas = serialiser.gather_record_areas(record)
    assert areas
    assert len(areas) == 2

    # the region built from candidate clusters
    first = areas[0]
    assert first["end"] == protos[1].location.end
    assert first["products"] == [proto.product for proto in protos]
    assert first["subregions"] == []
    assert len(first["protoclusters"]) == 2
    assert list(first["protoclusters"]) == [0, 1]
    assert first["protoclusters"][1]["product"] == protos[1].product
    assert len(first["candidates"]) == 2
    assert first["candidates"][0]["protoclusters"] == [0]
    assert first["candidates"][1]["protoclusters"] == [0, 1]

    # the region built from the subregion alone
    second = areas[1]
    assert second["start"] == subregion.location.start
    assert second["subregions"]
    sub_area = second["subregions"][0]
    assert sub_area["start"] == subregion.location.start
    assert sub_area["tool"] == subregion.tool
    assert sub_area["label"] == subregion.label
    assert not second["protoclusters"]

    # lastly, check all this is properly embedded in the final JSON
    bio = record.to_biopython()
    dumped = serialiser.dump_records([bio], [{}], [record])
    assert dumped[0]["areas"] == areas
def test_regeneration(self):
    """Check sideloaded results survive a JSON round trip and reject a changed schema version."""
    target = DummyRecord()
    target.id = "rec_name"

    # go through real JSON text, not just the intermediate dict
    payload = json.loads(json.dumps(self.results.to_json()))
    rebuilt = structures.SideloadedResults.from_json(payload, target)
    for attribute in ("protoclusters", "subregions", "record_id"):
        assert getattr(rebuilt, attribute) == getattr(self.results, attribute)

    payload["schema_version"] = -1
    with self.assertRaisesRegex(ValueError, "Detection results have changed"):
        structures.SideloadedResults.from_json(payload, target)
def test_bad_biopython_conversion(self):
    """Check Module.from_biopython raises ValueError for malformed features.

    Covers missing qualifiers, an unknown module type, and invalid monomer
    pairings.
    """
    record = DummyRecord()
    module = create_module()
    add_module_references_to_record(module, record)

    # required qualifiers removed one at a time
    for missing_key in ("domains", "incomplete"):
        feature = module.to_biopython()[0]
        feature.qualifiers.pop(missing_key)
        pattern = "missing .* '?%s" % missing_key
        with self.assertRaisesRegex(ValueError, pattern):
            Module.from_biopython(feature, record=record)

    # an unrecognised module type
    feature = module.to_biopython()[0]
    feature.qualifiers["type"] = ["not a valid type"]
    with self.assertRaisesRegex(ValueError, "unknown module type"):
        Module.from_biopython(feature, record=record)

    # malformed monomer pairings, all applied to the same feature
    feature = module.to_biopython()[0]
    bad_pairings = ("missing", "has -> some -> extra", "missing ->", "-> missing", "->")
    for pairing in bad_pairings:
        feature.qualifiers["monomer_pairings"] = [pairing]
        with self.assertRaisesRegex(ValueError, "invalid monomer pairing"):
            Module.from_biopython(feature, record=record)
def test_detailed_biopython_conversion(self):
    """Check a fully-flagged module with a monomer round trips through biopython."""
    source = create_module(starter=True, final=True, complete=True, iterative=True)
    source.add_monomer("from", "to")

    converted = source.to_biopython()
    qualifiers = converted[0].qualifiers
    required = {
        "domains",
        "locus_tags",
        "complete",
        "starter_module",
        "final_module",
        "monomer_pairings",
    }
    assert set(qualifiers.keys()).issuperset(required)
    assert qualifiers["complete"] is None
    assert "incomplete" not in qualifiers

    record = DummyRecord()
    add_module_references_to_record(source, record)
    rebuilt = Module.from_biopython(converted[0], record=record)

    assert rebuilt.location == source.location
    assert rebuilt.is_starter_module()
    assert rebuilt.is_final_module()
    assert rebuilt.is_iterative()
    assert rebuilt.monomers == source.monomers
    assert rebuilt.is_complete()
    assert rebuilt.location == source.location
def setUp(self):
    """Create six CDS features, a record holding them, fake hits for five of them, and the signature names."""
    def fake_hits(*query_ids):
        # every fake hit differs only in its first argument
        return [FakeHSPHit(query, "GENE_1", 0, 10, 50, 0) for query in query_ids]

    gene_spans = [
        ("GENE_1", 0, 30000),
        ("GENE_2", 30000, 50000),
        ("GENE_3", 70000, 90000),
        ("GENE_X", 95000, 100000),
        ("GENE_4", 125000, 140000),
        ("GENE_5", 145000, 150000),
    ]
    self.feature_by_id = {
        name: DummyCDS(begin, stop, locus_tag=name)
        for name, begin, stop in gene_spans
    }
    # vital for py3 < 3.5
    self.features = sorted(self.feature_by_id.values(),
                           key=lambda feature: feature.location.start)
    self.record = DummyRecord(self.features)

    self.results_by_id = {
        "GENE_1": fake_hits("a", "b"),
        "GENE_2": fake_hits("a", "c"),
        "GENE_3": fake_hits("b", "c"),
        "GENE_4": fake_hits("e", "f"),
        "GENE_5": fake_hits("f", "g"),
    }
    self.signature_names = {"a", "b", "c", "d", "e", "f", "g", "modelA", "modelB"}
def test_multi_cds_tracking(self):
    """Check a multigene module is attached only to the CDS features containing its domains."""
    gene_names = "AB"
    module = create_module(domains=[DummyAntismashDomain(locus_tag=name)
                                    for name in gene_names])
    assert module.is_multigene_module()

    record = DummyRecord()
    add_module_references_to_record(module, record)
    record.add_cds_feature(DummyCDS(locus_tag="C"))

    # nothing is linked until the module itself is added
    for feature in record.get_cds_features():
        assert not feature.modules
    assert not record.get_modules()

    record.add_module(module)

    # make sure it's not added to every CDS
    unrelated = record.get_cds_by_name("C")
    assert not unrelated.modules
    # but that it is added to all CDSes with a domain included
    for name in gene_names:
        assert record.get_cds_by_name(name).modules == (module, )
def test_detailed_biopython_conversion(self):
    """Check a fully-flagged module round trips through biopython.

    A space is inserted into the first domain name before converting back,
    so the conversion has to cope with it.
    """
    source = create_module(starter=True, final=True, complete=True, iterative=True)
    source.add_monomer("from", "to")

    converted = source.to_biopython()
    qualifiers = converted[0].qualifiers
    required = {
        "domains",
        "locus_tags",
        "complete",
        "starter_module",
        "final_module",
        "monomer_pairings",
    }
    assert set(qualifiers.keys()).issuperset(required)
    assert qualifiers["complete"] is None
    assert "incomplete" not in qualifiers
    assert all(name and " " not in name for name in qualifiers["domains"])

    # fake an inserted space due to length
    leading = qualifiers["domains"][0]
    midpoint = len(leading) // 2
    qualifiers["domains"][0] = leading[:midpoint] + " " + leading[midpoint:]

    record = DummyRecord()
    add_module_references_to_record(source, record)
    rebuilt = Module.from_biopython(converted[0], record=record)

    assert rebuilt.location == source.location
    assert rebuilt.is_starter_module()
    assert rebuilt.is_final_module()
    assert rebuilt.is_iterative()
    assert rebuilt.monomers == source.monomers
    assert rebuilt.is_complete()
    assert rebuilt.location == source.location
def test_blank_records(self):
    """Check no GO terms are found for a record with no PFAMs or with only the PF00000 domain."""
    without_pfams = DummyRecord()

    with_single_pfam = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
    with_single_pfam.add_pfam_domain(DummyPFAMDomain(identifier="PF00000"))

    for record in (without_pfams, with_single_pfam):
        assert not pfam2go.get_gos_for_pfams(record)
def test_prediction_with_core_class2(self, _patched_rodeo):
    """Check the Class-II prediction produced when the cleavage site prediction is patched out."""
    # the cleavage result adjusted to leave at least one amino in core
    fake_cleavage = CleavageSiteHit(end=40, score=-6.8, lantype="Class-II")
    record = DummyRecord(features=[self.cds])
    with patch.object(lanthi, "predict_cleavage_site", return_value=fake_cleavage):
        prediction = run_lanthipred(record, self.cds, "Class-II", self.domains)
    assert prediction is not None
    expected_prefix = "Lanthipeptide(..40, -6.8, 'Class-II', 'LSQGLGGC', 1, 715"
    assert str(prediction).startswith(expected_prefix)
def test_features_from_file(self):
    """Check the fumigatus GFF file yields 11 CDS features with compound locations."""
    filename = path.get_full_path(__file__, 'data', 'fumigatus.cluster1.gff')
    record = DummyRecord()
    # a context manager ensures the file handle is closed, even on failure
    with open(filename) as handle:
        features = gff_parser.get_features_from_file(record, handle)
    assert len(features) == 11
    for feature in features:
        assert feature.type == 'CDS'
        assert isinstance(feature.location, CompoundLocation)
def test_conversion_bad_record(self):
    """Check Module.from_biopython raises ValueError with no record or with a record lacking the domains."""
    feature = create_module().to_biopython()[0]

    # no record supplied at all
    with self.assertRaisesRegex(ValueError, "record instance required"):
        Module.from_biopython(feature)

    # a record that doesn't hold the referenced domains
    empty_record = DummyRecord()
    with self.assertRaisesRegex(ValueError, "does not contain domain referenced by module"):
        Module.from_biopython(feature, record=empty_record)
def _create_dummy_record(reverse=False):
    """Build a 60-base dummy record holding three CDS features.

    Arguments:
        reverse: if truthy, the sequence is reverse complemented and the
                 CDS features are placed on the reverse strand with
                 recalculated coordinates

    Returns:
        the populated DummyRecord
    """
    sequence = Seq('GTGGAGCGGTACTAAATGTACTCCACTATCTGCTGATTGGAAACCACGGAGCGCTCTTAG', generic_dna)
    if reverse:
        sequence = sequence.reverse_complement()
    strand = -1 if reverse else 1

    record = DummyRecord(seq=str(sequence))
    total = len(sequence)
    spans = [(0, 15), (15, 36), (36, 60)]
    for number, (start, end) in enumerate(spans, 1):
        if reverse:
            # TODO: check this
            start, end = total - end + 3, total - start
        record.add_cds_feature(DummyCDS(start, end, strand=strand,
                                        locus_tag="orf%04d" % number))
    return record
def test_prediction_with_core(self):
    """Check that a result is produced for both Class-I and Class-II with a mocked cleavage site."""
    # the cleavage result adjusted to leave at least one amino in core
    fake_cleavage = CleavageSiteHit(start=38, end=40, score=-6.8, lantype="Class-II")
    mock("lanthi.predict_cleavage_site", returns=fake_cleavage)
    for suffix in ("I", "II"):
        record = DummyRecord(features=[self.cds])
        prediction = run_lanthipred(record, self.cds, "Class-%s" % suffix, self.domains)
        assert prediction is not None
def test_prediction_with_core_class1(self):
    """Check the Class-I prediction produced when the cleavage site prediction is mocked."""
    # the cleavage result adjusted to leave at least one amino in core
    fake_cleavage = CleavageSiteHit(end=40, score=-6.8, lantype="Class-I")
    mock("lanthi.predict_cleavage_site", returns=fake_cleavage)
    record = DummyRecord(features=[self.cds])
    prediction = run_lanthipred(record, self.cds, "Class-I", self.domains)
    assert prediction
    expected_prefix = "Lanthipeptide(..40, -6.8, 'Class-I', 'LSQGLGGC', 1, 715"
    assert str(prediction).startswith(expected_prefix)
def test_translation_outside_record(self):
    """Check that converting a CDS with a 5-residue translation and an open-ended location is rejected.

    Both strands are covered, each in a 10-base record.
    """
    record = DummyRecord(seq="A" * 10)
    forward = FeatureLocation(0, AfterPosition(6), strand=1)
    backward = FeatureLocation(BeforePosition(4), 10, strand=-1)
    for span in (forward, backward):
        feature = SeqFeature(span, type="CDS")
        feature.qualifiers["translation"] = ["M" * 5]
        with self.assertRaisesRegex(SecmetInvalidInputError,
                                    "translation extends out of record"):
            CDSFeature.from_biopython(feature, record=record)
def set_dummy_with_pfams(pfam_ids: Dict[str, FeatureLocation]) -> DummyRecord:
    """Build a dummy record holding one dummy PFAM domain per given identifier.

    Arguments:
        pfam_ids: a dictionary mapping PFAM identifier to the location to
                  use for that domain

    Returns:
        a DummyRecord with the created domains as its features
    """
    domains = [
        DummyPFAMDomain(location=location, protein_start=0, protein_end=5,
                        identifier=identifier,
                        domain_id='%s.%d.%d' % (identifier, location.start, location.end))
        for identifier, location in pfam_ids.items()
    ]
    return DummyRecord(features=domains)
def set_dummy_with_pfams(pfam_ids: Dict[str, FeatureLocation]) -> DummyRecord:
    """Build a dummy record holding one PFAM domain per given identifier.

    Arguments:
        pfam_ids: a dictionary mapping PFAM identifier to the location to
                  use for that domain

    Returns:
        a DummyRecord with the created domains as its features
    """
    def build_domain(identifier, location):
        # the identifier lives in db_xref, the domain id also encodes the location
        domain = PFAMDomain(location=location, description='FAKE',
                            protein_start=0, protein_end=5)
        domain.db_xref = [identifier]
        domain.domain_id = '%s.%d.%d' % (identifier, location.start, location.end)
        return domain

    domains = [build_domain(identifier, location)
               for identifier, location in pfam_ids.items()]
    return DummyRecord(features=domains)
def setUp(self):
    """Create a record with id "rec_id" and a FunctionResults instance with two hits and two mappings."""
    self.res_class = genefunctions.core.FunctionResults

    best = {
        "cds1": HMMResult("desc1", 0, 100, 2.3e-126, 416),
        "cds2": HMMResult("desc2", 5, 60, 3e-16, 20),
    }
    functions = {
        "cds1": GeneFunction.TRANSPORT,
        "cds2": GeneFunction.REGULATORY,
    }

    record = DummyRecord()
    record.id = "rec_id"
    self.record = record
    self.results = self.res_class(record.id, "toolname", best_hits=best,
                                  function_mapping=functions)
def setUp(self):
    """Create options pointing at the test GFF file and two 2000-base contigs."""
    self.config = get_simple_options(None, [])
    self.config.genefinding_gff3 = path.get_full_path(__file__, "data", "test_gff.gff")
    self.single_entry = False

    self.sequences = []
    for contig_id in ("CONTIG_1", "CONTIG_2"):
        contig = DummyRecord(seq="A" * 2000)
        contig.id = contig_id
        self.sequences.append(contig)
def test_blank_records(self):
    """Check no GO terms are found for a record with no PFAMs or with only the PF00000 domain."""
    without_pfams = DummyRecord()

    with_single_pfam = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
    domain = PFAMDomain(location=FeatureLocation(0, 12), description='MCPsignal',
                        protein_start=0, protein_end=5,
                        identifier="PF00000", tool="test")
    domain.domain_id = 'BLANK'
    with_single_pfam.add_pfam_domain(domain)

    for record in (without_pfams, with_single_pfam):
        assert not pfam2go.get_gos_for_pfams(record)
def test_blank_records(self):
    """Check that blank records give no GO terms and that the matching debug messages are logged."""
    without_pfams = DummyRecord()

    without_ids = Record(Seq("ATGTTATGAGGGTCATAACAT", generic_dna))
    domain = PFAMDomain(location=FeatureLocation(0, 12), description='MCPsignal',
                        protein_start=0, protein_end=5)
    domain.domain_id = 'BLANK'
    without_ids.add_pfam_domain(domain)

    # both calls happen inside the capture so each message can be checked
    with self.assertLogs(level='DEBUG') as captured:
        found = pfam2go.get_gos_for_pfams(without_pfams)
        assert 'No Pfam domains found' in str(captured.output)
        assert not found

        found = pfam2go.get_gos_for_pfams(without_ids)
        assert 'No Pfam ids found' in str(captured.output)
        assert not found
def make_dummy_record(self):
    """Build a record with three CDS features, two protoclusters and one region.

    The region is built from two candidate clusters, one per protocluster.
    The candidate clusters themselves are not passed as record features.
    """
    coding = [
        DummyCDS(start=800, end=2150, locus_tag='a'),
        DummyCDS(start=3400, end=4700, locus_tag='b'),
        DummyCDS(start=150, end=450, locus_tag='c'),
    ]
    lanthi_proto = DummyProtocluster(core_start=100, core_end=2200,
                                     neighbourhood_range=100,
                                     product='lanthipeptide-class-i')
    thio_proto = DummyProtocluster(core_start=3300, core_end=4800,
                                   neighbourhood_range=100,
                                   product='thiopeptide')
    candidates = [
        DummyCandidateCluster(clusters=[lanthi_proto]),
        DummyCandidateCluster(clusters=[thio_proto]),
    ]
    region = DummyRegion(candidate_clusters=candidates)
    features = coding + [lanthi_proto, thio_proto, region]
    return DummyRecord(seq='FAKESEQ' * 1000, features=features)
def test_minimal_biopython_conversion(self):
    """Check a default module round trips through biopython with no optional flags set."""
    source = create_module()

    converted = source.to_biopython()
    assert isinstance(converted, list) and len(converted) == 1
    feature = converted[0]
    assert feature.location == source.location

    required = {"domains", "locus_tags", "incomplete"}
    assert set(feature.qualifiers.keys()).issuperset(required)
    assert feature.qualifiers["incomplete"] is None
    assert "complete" not in feature.qualifiers

    record = DummyRecord()
    add_module_references_to_record(source, record)
    rebuilt = Module.from_biopython(feature, record=record)

    assert rebuilt.location == source.location
    assert not rebuilt.is_starter_module()
    assert not rebuilt.is_final_module()
    assert not rebuilt.is_iterative()
    assert not rebuilt.monomers
    assert not rebuilt.is_complete()
    assert rebuilt.location == source.location
def setUp(self):
    """Create a record with id "rec_id" and an AllFunctionResults instance for it."""
    self.res_class = genefunctions.AllFunctionResults

    record = DummyRecord()
    record.id = "rec_id"
    self.record = record

    self.results = self.res_class(record.id)