def generate_motif_features(feature: CDSFeature, motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Builds a CDSMotif feature for each motif hit found within a CDS

        Arguments:
            feature: the CDSFeature the motifs were found in
            motifs: a list of HMMResults, one per motif hit

        Returns:
            a list of CDSMotif features, in the same order as the given hits
    """
    cds_name = feature.get_name()
    # a locus tag is preferred for naming, whenever the CDS has one
    locus_tag = feature.locus_tag or cds_name

    motif_features = []
    # the counter is user facing, so it starts from 1
    for number, hit in enumerate(motifs, 1):
        protein_start = hit.query_start
        protein_end = hit.query_end
        dna_location = feature.get_sub_location_from_protein_coordinates(protein_start, protein_end)
        protein_location = FeatureLocation(protein_start, protein_end)

        motif = CDSMotif(dna_location, cds_name, protein_location, tool="nrps_pks_domains")
        motif.label = hit.hit_id
        motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, number)
        motif.evalue = hit.evalue
        motif.score = hit.bitscore
        motif.detection = "hmmscan"
        motif.database = "abmotifs"
        motif.locus_tag = locus_tag
        motif.translation = feature.translation[protein_start:protein_end]

        motif_features.append(motif)
    return motif_features
def generate_domain_features(gene: CDSFeature, domains: List[HMMResult]) -> Dict[HMMResult, AntismashDomain]:
    """ Generates AntismashDomain features for each provided HMMResult

        Domains are numbered per hit id within the gene, starting from 1, and
        that number is part of both the domain id and the label.

        Arguments:
            gene: the CDSFeature the domains were found in
            domains: a list of HMMResults found in the CDSFeature

        Returns:
            a dictionary mapping the HMMResult used to the matching AntismashDomain
    """
    new_features = {}
    domain_counts = defaultdict(int)  # type: Dict[str, int]
    for domain in domains:
        loc = gene.get_sub_location_from_protein_coordinates(domain.query_start, domain.query_end)

        # set up new feature
        new_feature = AntismashDomain(loc, tool="nrps_pks_domains")
        new_feature.domain = domain.hit_id
        new_feature.locus_tag = gene.locus_tag or gene.get_name()
        new_feature.detection = "hmmscan"
        new_feature.database = "nrpspksdomains.hmm"
        new_feature.evalue = domain.evalue
        new_feature.score = domain.bitscore

        # the slice uses the same half-open protein coordinates as the location
        # above, so the translation always matches the region the feature covers
        new_feature.translation = gene.translation[domain.query_start:domain.query_end]

        domain_counts[domain.hit_id] += 1  # 1-indexed, so increment before use
        domain_name = "{}_{}.{}".format(gene.get_name(), domain.hit_id, domain_counts[domain.hit_id])

        new_feature.domain_id = "nrpspksdomains_" + domain_name
        new_feature.label = domain_name

        new_features[domain] = new_feature
    return new_features
def test_simple_location_forward_complete(self):
    """ The sub-location for protein coordinates 0 to 5 of a simple forward
        CDS must extract the whole sequence and translate to the full protein
    """
    whole_cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple", translation="A")
    sub_location = whole_cds.get_sub_location_from_protein_coordinates(0, 5)
    nucleotides = sub_location.extract(self.magic)
    assert nucleotides == self.magic
    assert nucleotides.translate() == self.translation
def test_invalid_qualifier(self):
    """ Assigning anything other than a SecMetQualifier to sec_met must
        raise a TypeError
    """
    expected_message = "can only be set to an instance of SecMetQualifier"
    cds = CDSFeature(FeatureLocation(1, 5, 1), locus_tag="test", translation="A")
    for invalid in ("bad", ["stuff"], {}, 1):
        with self.assertRaisesRegex(TypeError, expected_message):
            cds.sec_met = invalid
def test_simple_location_forward_partial(self):
    """ Partial protein ranges of a simple forward CDS must map onto the
        matching nucleotide triplets and the matching slice of the translation
    """
    cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple", translation="A")
    protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
    for prot_start, prot_end in protein_ranges:
        print("testing", prot_start, prot_end)
        sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
        print(sub_location)
        nucleotides = sub_location.extract(self.magic)
        # each amino acid corresponds to three nucleotides
        assert nucleotides == self.magic[prot_start * 3:prot_end * 3]
        assert nucleotides.translate() == self.translation[prot_start:prot_end]
def test_translation_outside_record(self):
    """ Converting a CDS with an open-ended location and a five residue
        translation, within a ten base record, must be rejected
    """
    record = DummyRecord(seq="A" * 10)
    open_ended_locations = [
        FeatureLocation(0, AfterPosition(6), strand=1),
        FeatureLocation(BeforePosition(4), 10, strand=-1),
    ]
    for location in open_ended_locations:
        bio_feature = SeqFeature(location, type="CDS")
        bio_feature.qualifiers["translation"] = ["M" * 5]
        with self.assertRaisesRegex(SecmetInvalidInputError, "translation extends out of record"):
            CDSFeature.from_biopython(bio_feature, record=record)
def test_compound_location_reverse_full(self):
    """ The full protein range of a reverse strand compound CDS must give a
        three part compound location covering the whole CDS
    """
    self.reverse_strand()
    reverse_cds = CDSFeature(self.location, locus_tag="compound", translation="A")
    sub_location = reverse_cds.get_sub_location_from_protein_coordinates(0, 5)
    assert isinstance(sub_location, CompoundLocation)
    assert len(sub_location.parts) == 3
    # show both sets of parts to help with debugging failures
    print([str(part) for part in reverse_cds.location.parts])
    print([str(part) for part in sub_location.parts])
    assert len(sub_location) == len(reverse_cds.location)
    translated = sub_location.extract(self.magic_split).translate()
    assert translated == self.translation[0:5]
def reverse_strand(self):
    """ Flips the fixture onto the opposite strand: both sequences are
        reverse complemented, every sub-location has its strand negated and
        the compound location and CDS are rebuilt from the flipped parts
    """
    self.magic = self.magic.reverse_complement()
    self.magic_split = self.magic_split.reverse_complement()

    flipped = []
    for part in self.sub_locations:
        flipped.append(FeatureLocation(part.start, part.end, strand=part.strand * -1))
    self.sub_locations = flipped

    # the new strand doubles as the slice step, so a strand of -1 also
    # reverses the order of the parts
    step = flipped[0].strand
    self.location = CompoundLocation(flipped[::step])
    self.cds = CDSFeature(self.location, locus_tag="compound", translation="A")
def setUp(self):
    """ Creates a three part, forward strand compound CDS along with the
        spliced and unspliced sequences and the expected translation
    """
    self.translation = "MAGIC"
    self.magic = Seq("ATGGCAGGTATTTGT")
    # the same coding sequence, with filler between the coding parts
    self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")
    part_bounds = [(0, 6), (12, 15), (21, 27)]
    self.sub_locations = [FeatureLocation(start, end, strand=1) for start, end in part_bounds]
    self.location = CompoundLocation(self.sub_locations)
    self.cds = CDSFeature(self.location, locus_tag="compound", translation="A")
def test_required_identifiers(self):
    """ A CDS without any identifier must be rejected, while any single one
        of locus_tag, protein_id or gene is enough to construct it
    """
    expected_message = "requires at least one of: gene, protein_id, locus_tag"
    with self.assertRaisesRegex(ValueError, expected_message):
        CDSFeature(FeatureLocation(1, 5, 1), translation="A")
    for identifier in ("locus_tag", "protein_id", "gene"):
        assert CDSFeature(FeatureLocation(1, 5, 1), translation="A", **{identifier: "foo"})
def setUp(self):
    """ Builds a clusterfinder config and a 2000 base dummy record holding
        one CDS and one PFAM domain for each entry of the table below
    """
    options = ["--cf-create-clusters",
               "--cf-mean-threshold", "0.6",
               "--cf-min-cds", "5",
               "--cf-min-pfams", "5"]
    self.config = build_config(options, modules=[clusterfinder], isolated=True)
    update_config({"enabled_cluster_types": []})

    self.record = DummyRecord(seq=Seq("A" * 2000))
    # each entry is (start, end, probability, PFAM identifier)
    domain_table = [
        (10, 20, 0.1, 'PF77777'),
        (30, 40, 0.3, 'PF00106'),
        (50, 60, 0.4, 'PF00107'),
        (60, 70, 0.7, 'PF00109'),
        (70, 80, 0.98, 'PF08484'),
        (90, 100, 0.8, 'PF02401'),
        (100, 110, 0.32, 'PF04369'),
        (110, 120, 1.0, 'PF00128'),
        (130, 140, 0.2, 'PF77776'),
        (500, 505, None, 'PF77775'),
        (1010, 1020, 0.1, 'PF77774'),
        (1030, 1040, 0.3, 'PF00106'),
        (1050, 1060, 0.4, 'PF00107'),
        (1060, 1070, 0.7, 'PF00109'),
        (1070, 1080, 0.98, 'PF08484'),
        (1090, 1100, 0.8, 'PF02401'),
        (1100, 1110, 0.32, 'PF04369'),
        (1110, 1120, 1.0, 'PF00128'),
    ]
    for start, end, probability, pfam_id in domain_table:
        location = FeatureLocation(start, end, strand=1)
        cds = CDSFeature(location, locus_tag=str(start), translation="A")
        self.record.add_cds_feature(cds)

        pfam = PFAMDomain(location, "dummy_description", protein_start=start + 1,
                          protein_end=end - 1, identifier=pfam_id, tool="test")
        pfam.domain_id = "pfam_%d" % start
        pfam.probability = probability
        self.record.add_pfam_domain(pfam)
def test_without_genefunctions(self):
    """ A CDS without gene functions must not emit gene function qualifiers
        and must still have none after being rebuilt from biopython
    """
    converted = self.convert()
    for qualifier in ("gene_functions", "gene_kind"):
        assert qualifier not in converted.qualifiers
    rebuilt = CDSFeature.from_biopython(converted)
    assert not rebuilt.gene_functions
def test_compound_location_reverse_multiple(self):
    """ A protein range spanning two parts of a reverse strand compound CDS
        must give a two part compound location with the expected bounds
    """
    self.reverse_strand()
    reverse_cds = CDSFeature(self.location, locus_tag="compound", translation="A")
    sub_location = reverse_cds.get_sub_location_from_protein_coordinates(2, 4)
    assert isinstance(sub_location, CompoundLocation)
    # show both sets of parts to help with debugging failures
    print([str(part) for part in reverse_cds.location.parts])
    print([str(part) for part in sub_location.parts])
    assert len(sub_location.parts) == 2
    assert len(sub_location) == 6
    first, second = sub_location.parts
    assert (first.start, first.end) == (12, 15)
    assert (second.start, second.end) == (3, 6)
    translated = sub_location.extract(self.magic_split).translate()
    assert translated == self.translation[2:4]
def test_mixed_strand(self):
    """ Compound locations whose parts have differing strands must be
        rejected when converting from biopython
    """
    bio_feature = self.cds.to_biopython()[0]
    # the strands of the first and second part of each compound location
    strand_pairs = [(-1, 1), (1, None)]
    mixed_locations = [
        CompoundLocation([FeatureLocation(1, 5, strand=first),
                          FeatureLocation(8, 10, strand=second)])
        for first, second in strand_pairs
    ]
    for mixed in mixed_locations:
        bio_feature.location = mixed
        with self.assertRaisesRegex(ValueError, "compound locations with mixed strands"):
            CDSFeature.from_biopython(bio_feature)
def test_bad_translation(self):
    """ Missing, empty, or invalid-character translations must be rejected
        with a ValueError when constructing a CDS
    """
    expected_message = "valid translation required|invalid translation characters"
    location = FeatureLocation(1, 5, 1)
    for translation in (None, "A?", "A!", ""):
        with self.assertRaisesRegex(ValueError, expected_message):
            CDSFeature(location, locus_tag="test", translation=translation)
def test_without_secmet(self):
    """ A CDS without a sec_met qualifier must not emit one on conversion
        and must still be without one after being rebuilt
    """
    assert not self.cds.sec_met
    converted = self.convert()
    # the bare "sec_met" key is checked in order to detect legacy versions
    for qualifier in ("sec_met", "sec_met_domain"):
        assert qualifier not in converted.qualifiers
    rebuilt = CDSFeature.from_biopython(converted)
    assert not rebuilt.sec_met
def test_frameshifted_location(self):
    """ A compound location whose parts overlap by one base must still give
        a two part sub-location with the expected outer bounds
    """
    # the second part starts one base before the first part ends
    overlapping_parts = [FeatureLocation(3, 9, 1), FeatureLocation(8, 14, 1)]
    location = CompoundLocation(overlapping_parts)
    assert len(location) == 12

    genome = Seq("ATGATGAGCCCTCGTCTAGACTACAATGA")
    coding = location.extract(genome)
    assert coding == "ATGAGCCCCTCG"
    assert len(coding) == len(location)

    protein = coding.translate()
    assert protein == "MSPS"

    cds = CDSFeature(location, locus_tag="test", translation=protein)
    sub_location = cds.get_sub_location_from_protein_coordinates(1, 3)
    assert isinstance(sub_location, CompoundLocation)
    assert len(sub_location.parts) == 2
    assert sub_location.start == 6
    assert sub_location.end == 11
def test_compound_location_reverse_single(self):
    """ Protein ranges falling within a single part of a reverse strand
        compound CDS must give a simple location with the expected bounds
    """
    self.reverse_strand()
    reverse_cds = CDSFeature(self.location, locus_tag="compound", translation="A")
    # each case is (protein range, expected length, expected nucleotide bounds)
    cases = [
        ((0, 2), 6, (21, 27)),
        ((2, 3), 3, (12, 15)),
    ]
    for (prot_start, prot_end), length, (dna_start, dna_end) in cases:
        sub_location = reverse_cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
        assert isinstance(sub_location, FeatureLocation)
        assert len(sub_location) == length
        assert sub_location.start == dna_start
        assert sub_location.end == dna_end
        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[prot_start:prot_end]
def test_with_genefunctions(self):
    """ An added gene function must show up in the converted qualifiers and
        survive being rebuilt from biopython
    """
    self.cds.gene_functions.add(GeneFunction.ADDITIONAL, "testtool", "dummy")
    converted = self.convert()
    assert "gene_functions" in converted.qualifiers

    expected_kind = [str(self.cds.gene_function)]
    assert converted.qualifiers["gene_kind"] == expected_kind
    assert expected_kind == ["biosynthetic-additional"]

    rebuilt = CDSFeature.from_biopython(converted)
    assert rebuilt.gene_function == self.cds.gene_function
    original_by_tool = self.cds.gene_functions.get_by_tool("testtool")
    assert rebuilt.gene_functions.get_by_tool("testtool") == original_by_tool
def test_basics(self):
    """ The location and identifying qualifiers must be present after
        conversion and must be restored when rebuilding from biopython
    """
    converted = self.convert()
    assert converted.location == self.cds.location
    expected_qualifiers = [
        ("locus_tag", ["loctag"]),
        ("gene", ["gene"]),
        ("protein_id", ["prot_id"]),
        ("translation", ["A" * 4]),
    ]
    for qualifier, expected in expected_qualifiers:
        assert converted.qualifiers[qualifier] == expected

    rebuilt = CDSFeature.from_biopython(converted)
    for attribute in ("location", "locus_tag", "gene", "protein_id"):
        assert getattr(rebuilt, attribute) == getattr(self.cds, attribute)
def test_with_secmet(self):
    """ The domains of a sec_met qualifier must be emitted as
        "sec_met_domain" qualifiers and restored when rebuilding
    """
    domains = [
        SecMetQualifier.Domain("testA", 0.1, 1.1, 3, "test"),
        SecMetQualifier.Domain("testB", 5.1, 3.9, 5, "dummy"),
    ]
    self.cds.sec_met = SecMetQualifier(domains)
    converted = self.convert()
    # again, detecting leftover legacy versions
    assert "sec_met" not in converted.qualifiers
    emitted = converted.qualifiers["sec_met_domain"]
    assert len(emitted) == 2
    assert emitted == [str(domain) for domain in domains]

    rebuilt = CDSFeature.from_biopython(converted)
    assert rebuilt.sec_met
    assert len(rebuilt.sec_met.domains) == len(domains)
    assert rebuilt.sec_met.domains == domains
def setUp(self):
    """ Creates a small forward strand CDS with all three identifiers set """
    identifiers = {
        "locus_tag": "loctag",
        "gene": "gene",
        "protein_id": "prot_id",
    }
    location = FeatureLocation(0, 12, 1)
    self.cds = CDSFeature(location, translation="A" * 4, **identifiers)
class TestCDSBiopythonConversion(unittest.TestCase):
    """ Tests the conversion of CDSFeature to biopython features and back """
    def setUp(self):
        """ Creates a small forward strand CDS with all three identifiers set """
        self.cds = CDSFeature(FeatureLocation(0, 12, 1),
                              translation="A" * 4,
                              locus_tag="loctag",
                              gene="gene",
                              protein_id="prot_id")

    def convert(self):
        """ Converts the fixture CDS to biopython, checking that exactly one
            feature is generated

            Returns:
                the single biopython feature generated
        """
        bio_features = self.cds.to_biopython()
        assert isinstance(bio_features, list)
        assert len(bio_features) == 1
        return bio_features[0]

    def test_basics(self):
        """ Location and identifying qualifiers must survive a round trip """
        bio = self.convert()
        assert bio.location == self.cds.location
        assert bio.qualifiers["locus_tag"] == ["loctag"]
        assert bio.qualifiers["gene"] == ["gene"]
        assert bio.qualifiers["protein_id"] == ["prot_id"]
        assert bio.qualifiers["translation"] == ["A" * 4]

        regen = CDSFeature.from_biopython(bio)
        assert regen.location == self.cds.location
        assert regen.locus_tag == self.cds.locus_tag
        assert regen.gene == self.cds.gene
        assert regen.protein_id == self.cds.protein_id

    def test_without_genefunctions(self):
        """ No gene function qualifiers are emitted when none were added """
        bio = self.convert()
        assert "gene_functions" not in bio.qualifiers
        assert "gene_kind" not in bio.qualifiers

        regen = CDSFeature.from_biopython(bio)
        assert not regen.gene_functions

    def test_with_genefunctions(self):
        """ An added gene function must be emitted and survive a round trip """
        self.cds.gene_functions.add(GeneFunction.ADDITIONAL, "testtool",
                                    "dummy")
        bio = self.convert()
        assert "gene_functions" in bio.qualifiers
        assert bio.qualifiers["gene_kind"] == [str(
            self.cds.gene_function)] == ["biosynthetic-additional"]

        regen = CDSFeature.from_biopython(bio)
        assert regen.gene_function == self.cds.gene_function
        assert regen.gene_functions.get_by_tool(
            "testtool") == self.cds.gene_functions.get_by_tool("testtool")

    def test_without_secmet(self):
        """ No sec_met qualifiers are emitted when sec_met is not set """
        assert not self.cds.sec_met
        bio = self.convert()
        assert "sec_met" not in bio.qualifiers  # for detecting legacy versions
        assert "sec_met_domain" not in bio.qualifiers

        regen = CDSFeature.from_biopython(bio)
        assert not regen.sec_met

    def test_with_secmet(self):
        """ sec_met domains must be emitted as "sec_met_domain" qualifiers
            and survive a round trip
        """
        domains = [
            SecMetQualifier.Domain("testA", 0.1, 1.1, 3, "test"),
            SecMetQualifier.Domain("testB", 5.1, 3.9, 5, "dummy")
        ]
        self.cds.sec_met = SecMetQualifier(domains)
        bio = self.convert()
        assert "sec_met" not in bio.qualifiers  # again, detecting leftover legacy versions
        assert len(bio.qualifiers["sec_met_domain"]) == 2
        assert bio.qualifiers["sec_met_domain"] == list(map(str, domains))

        regen = CDSFeature.from_biopython(bio)
        assert regen.sec_met
        assert len(regen.sec_met.domains) == len(domains)
        assert regen.sec_met.domains == domains

    def test_mixed_strand(self):
        """ Compound locations with differing part strands must be rejected """
        bio = self.cds.to_biopython()[0]
        for location in [
                CompoundLocation([
                    FeatureLocation(1, 5, strand=-1),
                    FeatureLocation(8, 10, strand=1)
                ]),
                CompoundLocation([
                    FeatureLocation(1, 5, strand=1),
                    FeatureLocation(8, 10, strand=None)
                ])
        ]:
            bio.location = location
            with self.assertRaisesRegex(
                    ValueError, "compound locations with mixed strands"):
                CDSFeature.from_biopython(bio)
class TestCDSBiopythonConversion(unittest.TestCase):
    """ Tests the conversion of CDSFeature to biopython features and back,
        including rejection of invalid biopython input
    """
    def setUp(self):
        """ Creates a small forward strand CDS with all three identifiers set """
        self.cds = CDSFeature(FeatureLocation(0, 12, 1),
                              translation="A" * 4,
                              locus_tag="loctag",
                              gene="gene",
                              protein_id="prot_id")

    def convert(self):
        """ Converts the fixture CDS to biopython, checking that exactly one
            feature is generated

            Returns:
                the single biopython feature generated
        """
        bio_features = self.cds.to_biopython()
        assert isinstance(bio_features, list)
        assert len(bio_features) == 1
        return bio_features[0]

    def test_basics(self):
        """ Location and identifying qualifiers must survive a round trip """
        bio = self.convert()
        assert bio.location == self.cds.location
        assert bio.qualifiers["locus_tag"] == ["loctag"]
        assert bio.qualifiers["gene"] == ["gene"]
        assert bio.qualifiers["protein_id"] == ["prot_id"]
        assert bio.qualifiers["translation"] == ["A" * 4]

        regen = CDSFeature.from_biopython(bio)
        assert regen.location == self.cds.location
        assert regen.locus_tag == self.cds.locus_tag
        assert regen.gene == self.cds.gene
        assert regen.protein_id == self.cds.protein_id

    def test_without_genefunctions(self):
        """ No gene function qualifiers are emitted when none were added """
        bio = self.convert()
        assert "gene_functions" not in bio.qualifiers
        assert "gene_kind" not in bio.qualifiers

        regen = CDSFeature.from_biopython(bio)
        assert not regen.gene_functions

    def test_with_genefunctions(self):
        """ An added gene function must be emitted and survive a round trip """
        self.cds.gene_functions.add(GeneFunction.ADDITIONAL, "testtool",
                                    "dummy")
        bio = self.convert()
        assert "gene_functions" in bio.qualifiers
        assert bio.qualifiers["gene_kind"] == [str(
            self.cds.gene_function)] == ["biosynthetic-additional"]

        regen = CDSFeature.from_biopython(bio)
        assert regen.gene_function == self.cds.gene_function
        assert regen.gene_functions.get_by_tool(
            "testtool") == self.cds.gene_functions.get_by_tool("testtool")

    def test_without_secmet(self):
        """ No sec_met qualifiers are emitted when sec_met is not set """
        assert not self.cds.sec_met
        bio = self.convert()
        assert "sec_met" not in bio.qualifiers  # for detecting legacy versions
        assert "sec_met_domain" not in bio.qualifiers

        regen = CDSFeature.from_biopython(bio)
        assert not regen.sec_met

    def test_with_secmet(self):
        """ sec_met domains must be emitted as "sec_met_domain" qualifiers
            and survive a round trip
        """
        domains = [
            SecMetQualifier.Domain("testA", 0.1, 1.1, 3, "test"),
            SecMetQualifier.Domain("testB", 5.1, 3.9, 5, "dummy")
        ]
        self.cds.sec_met = SecMetQualifier(domains)
        bio = self.convert()
        assert "sec_met" not in bio.qualifiers  # again, detecting leftover legacy versions
        assert len(bio.qualifiers["sec_met_domain"]) == 2
        assert bio.qualifiers["sec_met_domain"] == list(map(str, domains))

        regen = CDSFeature.from_biopython(bio)
        assert regen.sec_met
        assert len(regen.sec_met.domains) == len(domains)
        assert regen.sec_met.domains == domains

    def test_mixed_strand(self):
        """ Compound locations with differing part strands must be rejected """
        bio = self.cds.to_biopython()[0]
        for location in [
                CompoundLocation([
                    FeatureLocation(1, 5, strand=-1),
                    FeatureLocation(8, 10, strand=1)
                ]),
                CompoundLocation([
                    FeatureLocation(1, 5, strand=1),
                    FeatureLocation(8, 10, strand=None)
                ])
        ]:
            bio.location = location
            with self.assertRaisesRegex(
                    ValueError, "compound locations with mixed strands"):
                CDSFeature.from_biopython(bio)
        # compound locations starting with an invalid strand will be treated
        # the same as a non-compound location with a bad strand

    def test_translation_outside_record(self):
        """ An open-ended location with a five residue translation, within a
            ten base record, must be rejected
        """
        rec = DummyRecord(seq="A" * 10)
        for location in [
                FeatureLocation(0, AfterPosition(6), strand=1),
                FeatureLocation(BeforePosition(4), 10, strand=-1)
        ]:
            bio = SeqFeature(location, type="CDS")
            bio.qualifiers["translation"] = ["M" * 5]
            with self.assertRaisesRegex(SecmetInvalidInputError,
                                        "translation extends out of record"):
                CDSFeature.from_biopython(bio, record=rec)

    def test_invalid_translation_table(self):
        """ A non-numeric transl_table qualifier must be rejected """
        bio = self.cds.to_biopython()[0]
        bio.qualifiers["transl_table"] = ["11a"]
        with self.assertRaisesRegex(SecmetInvalidInputError,
                                    "invalid translation table"):
            CDSFeature.from_biopython(bio)
def test_bad_strand(self):
    """ Constructing a CDS on a location with a strand of 0 must raise a
        ValueError
    """
    expected_message = "Strand must be"
    # the location is built inside the context, so an error from either
    # constructor is caught
    with self.assertRaisesRegex(ValueError, expected_message):
        CDSFeature(FeatureLocation(1, 5, strand=0), locus_tag="test", translation="A")
def test_bad_strand(self):
    """ Constructing a CDS on a location with a strand of 0 or None must
        raise a ValueError
    """
    expected_message = "invalid strand"
    for bad_strand in (0, None):
        # the location is built inside the context, so an error from either
        # constructor is caught
        with self.assertRaisesRegex(ValueError, expected_message):
            CDSFeature(FeatureLocation(1, 5, bad_strand), locus_tag="test", translation="A")
def test_complicated(self):
    """ Checks a protein sub-range of a CDS built from ten forward strand parts

        The sub-location for protein coordinates 353 to 412 must extract and
        translate to the matching slice of the expected translation.
    """
    parts = [
        FeatureLocation(121124, 122061, 1),
        FeatureLocation(122339, 122383, 1),
        FeatureLocation(122559, 122666, 1),
        FeatureLocation(122712, 122874, 1),
        FeatureLocation(123060, 123337, 1),
        FeatureLocation(123481, 123749, 1),
        FeatureLocation(123809, 124032, 1),
        FeatureLocation(124091, 124193, 1),
        FeatureLocation(124236, 124401, 1),
        FeatureLocation(124684, 124724, 1)
    ]
    location = CompoundLocation(parts, operator="join")
    cds = CDSFeature(location, locus_tag="complicated", translation="A")
    # sequence data beginning at the start of the location, it is padded
    # before use so that the coordinates of the parts line up
    seq = (
        "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
        "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
        "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
        "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
        "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
        "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
        "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
        "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
        "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
        "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
        "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
        "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
        "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
        "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
        "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
        "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
        "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
        "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
        "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
        "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
        "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
        "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
        "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
        "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
        "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
        "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
        "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
        "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
        "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
        "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
        "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
        "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
        "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
        "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
        "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
        "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
        "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
        "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
        "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
        "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
        "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
        "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
        "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
        "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
        "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
        "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
        "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
        "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
        "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
        "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
    )
    # the translation expected for the complete CDS
    translation = (
        "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
        "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
        "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
        "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
        "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
        "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
        "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
        "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
        "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
        "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
        "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
        "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
        "HTITLR")
    new = cds.get_sub_location_from_protein_coordinates(353, 412)
    # pad the beginning to match the location
    assert new.extract(Seq("x" * location.start + seq)).translate() == translation[353:412]
class TestCDSProteinLocation(unittest.TestCase):
    """ Tests the conversion of protein coordinates within a CDS into
        nucleotide locations, for simple and compound locations on both strands
    """
    def setUp(self):
        """ Creates a three part, forward strand compound CDS along with the
            spliced and unspliced sequences and the expected translation
        """
        # the coding sequence with filler between the coding parts
        self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")
        self.magic = Seq("ATGGCAGGTATTTGT")
        self.translation = "MAGIC"
        self.sub_locations = [
            FeatureLocation(0, 6, strand=1),
            FeatureLocation(12, 15, strand=1),
            FeatureLocation(21, 27, strand=1)
        ]
        self.location = CompoundLocation(self.sub_locations)
        self.cds = CDSFeature(self.location,
                              locus_tag="compound",
                              translation="A")

    def reverse_strand(self):
        """ Flips the fixture onto the opposite strand: both sequences are
            reverse complemented, every sub-location has its strand negated
            and the compound location and CDS are rebuilt
        """
        self.magic = self.magic.reverse_complement()
        self.magic_split = self.magic_split.reverse_complement()
        self.sub_locations = [
            FeatureLocation(loc.start, loc.end, strand=loc.strand * -1)
            for loc in self.sub_locations
        ]
        # the new strand doubles as the slice step, so a strand of -1 also
        # reverses the order of the parts
        self.location = CompoundLocation(
            self.sub_locations[::self.sub_locations[0].strand])
        self.cds = CDSFeature(self.location,
                              locus_tag="compound",
                              translation="A")

    def test_simple_location_forward_complete(self):
        """ The full protein range of a simple forward CDS """
        cds = CDSFeature(FeatureLocation(0, 15, 1),
                         locus_tag="simple",
                         translation="A")
        new = cds.get_sub_location_from_protein_coordinates(0, 5)
        extracted = new.extract(self.magic)
        assert extracted == self.magic
        assert extracted.translate() == self.translation

    def test_simple_location_forward_partial(self):
        """ Partial protein ranges of a simple forward CDS """
        cds = CDSFeature(FeatureLocation(0, 15, 1),
                         locus_tag="simple",
                         translation="A")
        for start, end in [(1, 5), (0, 3), (2, 3), (1, 4)]:
            print("testing", start, end)
            new = cds.get_sub_location_from_protein_coordinates(start, end)
            print(new)
            extracted = new.extract(self.magic)
            # each amino acid corresponds to three nucleotides
            assert extracted == self.magic[start * 3:end * 3]
            assert extracted.translate() == self.translation[start:end]

    def test_compound_location_forward_full(self):
        """ The full protein range of the forward compound CDS """
        new = self.cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(new, CompoundLocation)
        assert len(new.parts) == 3
        print(list(map(str, self.cds.location.parts)))
        print(list(map(str, new.parts)))
        assert len(new) == len(self.cds.location)
        assert new == self.location, "%s != %s" % (str(new), str(
            self.location))
        extracted = new.extract(self.magic_split)
        assert extracted == self.magic
        assert extracted.translate() == self.translation[0:5]

    def test_compound_forward_within_single(self):
        """ Protein ranges within a single part of the forward compound CDS """
        new = self.cds.get_sub_location_from_protein_coordinates(0, 2)
        assert isinstance(new, FeatureLocation)
        assert len(new) == 6
        assert new.start == 0
        assert new.end == 6
        assert new.extract(
            self.magic_split).translate() == self.translation[0:2]

        new = self.cds.get_sub_location_from_protein_coordinates(2, 3)
        assert isinstance(new, FeatureLocation)
        assert len(new) == 3
        assert new.start == 12
        assert new.end == 15
        assert new.extract(
            self.magic_split).translate() == self.translation[2:3]

    def test_compound_forward_over_multiple(self):
        """ A protein range spanning two parts of the forward compound CDS """
        new = self.cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(new, CompoundLocation)
        print(list(map(str, self.cds.location.parts)))
        print(list(map(str, new.parts)))
        assert len(new.parts) == 2
        assert len(new) == 6
        assert new.parts[0].start == 12
        assert new.parts[0].end == 15
        assert new.parts[1].start == 21
        assert new.parts[1].end == 24
        assert new.extract(
            self.magic_split).translate() == self.translation[2:4]

    def test_compound_location_reverse_full(self):
        """ The full protein range of the reverse strand compound CDS """
        self.reverse_strand()
        cds = CDSFeature(self.location,
                         locus_tag="compound",
                         translation="A")
        new = cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(new, CompoundLocation)
        assert len(new.parts) == 3
        print(list(map(str, cds.location.parts)))
        print(list(map(str, new.parts)))
        assert len(new) == len(cds.location)
        assert new.extract(
            self.magic_split).translate() == self.translation[0:5]

    def test_compound_location_reverse_single(self):
        """ Protein ranges within a single part of the reverse compound CDS """
        self.reverse_strand()
        cds = CDSFeature(self.location,
                         locus_tag="compound",
                         translation="A")

        new = cds.get_sub_location_from_protein_coordinates(0, 2)
        assert isinstance(new, FeatureLocation)
        assert len(new) == 6
        assert new.start == 21
        assert new.end == 27
        assert new.extract(
            self.magic_split).translate() == self.translation[0:2]

        new = cds.get_sub_location_from_protein_coordinates(2, 3)
        assert isinstance(new, FeatureLocation)
        assert len(new) == 3
        assert new.start == 12
        assert new.end == 15
        assert new.extract(
            self.magic_split).translate() == self.translation[2:3]

    def test_compound_location_reverse_multiple(self):
        """ A protein range spanning two parts of the reverse compound CDS """
        self.reverse_strand()
        cds = CDSFeature(self.location,
                         locus_tag="compound",
                         translation="A")

        new = cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(new, CompoundLocation)
        print(list(map(str, cds.location.parts)))
        print(list(map(str, new.parts)))
        assert len(new.parts) == 2
        assert len(new) == 6
        assert new.parts[0].start == 12
        assert new.parts[0].end == 15
        assert new.parts[1].start == 3
        assert new.parts[1].end == 6
        assert new.extract(
            self.magic_split).translate() == self.translation[2:4]

    def test_frameshifted_location(self):
        """ A compound location whose parts overlap by one base """
        # the second part starts one base before the first part ends
        location = CompoundLocation(
            [FeatureLocation(3, 9, 1),
             FeatureLocation(8, 14, 1)])
        assert len(location) == 12
        seq = Seq("ATGATGAGCCCTCGTCTAGACTACAATGA")
        extracted = location.extract(seq)
        assert extracted == "ATGAGCCCCTCG"
        assert len(extracted) == len(location)
        translation = extracted.translate()
        assert translation == "MSPS"

        cds = CDSFeature(location, locus_tag="test", translation=translation)
        new = cds.get_sub_location_from_protein_coordinates(1, 3)
        assert isinstance(new, CompoundLocation)
        assert len(new.parts) == 2
        assert new.start == 6
        assert new.end == 11

    def test_complicated(self):
        """ A protein sub-range of a CDS built from ten forward strand parts """
        parts = [
            FeatureLocation(121124, 122061, 1),
            FeatureLocation(122339, 122383, 1),
            FeatureLocation(122559, 122666, 1),
            FeatureLocation(122712, 122874, 1),
            FeatureLocation(123060, 123337, 1),
            FeatureLocation(123481, 123749, 1),
            FeatureLocation(123809, 124032, 1),
            FeatureLocation(124091, 124193, 1),
            FeatureLocation(124236, 124401, 1),
            FeatureLocation(124684, 124724, 1)
        ]
        location = CompoundLocation(parts, operator="join")
        cds = CDSFeature(location,
                         locus_tag="complicated",
                         translation="A")
        # sequence data beginning at the start of the location, it is padded
        # before use so that the coordinates of the parts line up
        seq = (
            "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
            "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
            "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
            "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
            "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
            "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
            "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
            "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
            "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
            "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
            "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
            "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
            "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
            "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
            "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
            "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
            "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
            "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
            "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
            "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
            "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
            "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
            "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
            "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
            "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
            "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
            "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
            "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
            "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
            "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
            "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
            "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
            "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
            "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
            "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
            "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
            "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
            "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
            "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
            "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
            "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
            "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
            "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
            "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
            "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
            "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
            "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
            "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
            "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
            "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
        )
        # the translation expected for the complete CDS
        translation = (
            "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
            "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
            "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
            "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
            "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
            "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
            "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
            "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
            "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
            "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
            "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
            "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
            "HTITLR")
        new = cds.get_sub_location_from_protein_coordinates(353, 412)
        # pad the beginning to match the location
        assert new.extract(Seq("x" * location.start + seq)).translate() == translation[353:412]

    def test_extends_past_after(self):
        """ A protein range running past an open-ended final part must stop
            at the end of the final complete codon
        """
        self.sub_locations[-1] = FeatureLocation(21, AfterPosition(29), strand=1)
        self.cds.location = CompoundLocation(self.sub_locations)

        new = self.cds.get_sub_location_from_protein_coordinates(0, 7)
        assert new.end == 27

    def test_extends_past_before(self):
        """ The reverse strand equivalent, with an open-ended first part """
        self.reverse_strand()
        self.sub_locations[0] = FeatureLocation(BeforePosition(2), self.sub_locations[0].end, strand=-1)
        # parts of a reverse strand compound location are given in reverse order
        self.cds.location = CompoundLocation(self.sub_locations[::-1])
        new = self.cds.get_sub_location_from_protein_coordinates(0, 7)
        assert new.start == 3
def test_invalid_translation_table(self):
    """ A transl_table qualifier of "11a" must be rejected when converting
        from biopython
    """
    bio_feature = self.cds.to_biopython()[0]
    bio_feature.qualifiers["transl_table"] = ["11a"]
    expected_message = "invalid translation table"
    with self.assertRaisesRegex(SecmetInvalidInputError, expected_message):
        CDSFeature.from_biopython(bio_feature)
def __init__(self, feature: CDSFeature) -> None:
    """ Sets up the instance from a CDS, with empty domain and module lists

        Arguments:
            feature: the CDSFeature supplying the name and translation
    """
    # the attribute names handed on to the parent initialiser
    attribute_names = ["id", "sequence", "domains", "modules"]
    super().__init__(attribute_names)
    self.sequence = feature.translation
    self.id = feature.get_name()
    self.domains = []  # type: List[JSONDomain]
    self.modules = []  # type: List[JSONModule]