コード例 #1
0
class TestMutation():
    """Tests of variant-name generation and ``Mutation.mutation_output_name``.

    Each test asks the converter for the DNA variant names of one mutation,
    then builds a ``Mutation`` from one of those names and checks the name it
    reports back.
    """

    def setup(self):
        """Create the converter and derive the reference name from the FASTA path."""
        fasta_path = "src/mykrobe/data/NC_000962.3.fasta"
        self.reference_filepath = fasta_path
        # Reference name is whatever precedes the first '.fa' in the file name.
        fasta_basename = os.path.basename(fasta_path)
        self.reference = fasta_basename.split('.fa')[0]
        self.aa2dna = GeneAminoAcidChangeToDNAVariants(
            fasta_path, "src/mykrobe/data/NC_000962.3.gb")

    def teardown(self):
        """Nothing to clean up."""
        pass

    def test_mutation_name_forward_strand(self):
        """Amino-acid change S450L in rpoB keeps its name in the output."""
        gene_name = "rpoB"
        aa_change = "S450L"
        protein_coding = True
        expected_names = {
            "TCG761154TTA", "TCG761154TTG",
            "TCG761154CTA", "TCG761154CTT",
            "TCG761154CTC", "TCG761154CTG",
        }
        found_names = set(
            self.aa2dna.get_variant_names(gene_name, aa_change,
                                          protein_coding))
        assert found_names == expected_names

        mutation = Mutation(reference=self.reference,
                            var_name="TCG761154TTA",
                            gene=self.aa2dna.get_gene(gene_name),
                            mut=aa_change)
        assert mutation.mutation_output_name == "S450L"

    def test_mutation_name_reverse_strand(self):
        """Amino-acid change I11N in gid keeps its name in the output."""
        gene_name = "gid"
        aa_change = "I11N"
        protein_coding = True
        expected_names = {"GAT4408170ATT", "GAT4408170GTT"}
        found_names = set(
            self.aa2dna.get_variant_names(gene_name, aa_change,
                                          protein_coding))
        assert found_names == expected_names

        mutation = Mutation(reference=self.reference,
                            var_name="GAT4408170ATT",
                            gene=self.aa2dna.get_gene(gene_name),
                            mut=aa_change)
        assert mutation.mutation_output_name == "I11N"

    def test_mutation_name_dna_space(self):
        """DNA-level change C18CCA in pncA keeps its name in the output."""
        gene_name = "pncA"
        dna_change = "C18CCA"
        protein_coding = False
        found_names = set(
            self.aa2dna.get_variant_names(gene_name, dna_change,
                                          protein_coding))
        assert found_names == {"G2289224TGG"}

        # The variant name is looked up again and the first entry is used.
        first_name = self.aa2dna.get_variant_names(
            gene_name, dna_change, protein_coding)[0]
        mutation = Mutation(reference=self.reference,
                            var_name=first_name,
                            gene=self.aa2dna.get_gene(gene_name),
                            mut=dna_change)
        assert mutation.mutation_output_name == "C18CCA"
コード例 #2
0
class TestRegions():
    def teardown(self):
        """Teardown hook: drop the ``mykrobe-test`` database."""
        test_db_name = 'mykrobe-test'
        DB.drop_database(test_db_name)

    def setup(self):
        """Setup hook: reset the test database and build the shared fixtures.

        Sets ``reference_seq`` (sequence of the first FASTA record), ``gm``
        (the amino-acid-change-to-DNA-variant converter) and the
        ``reference_set`` / ``variant_set`` / ``variant_sets`` /
        ``reference_id`` objects that the variant-panel tests pass to
        ``Variant.create``.
        """
        fasta_path = "src/mykrobe/data/NC_000962.3.fasta"
        genbank_path = "src/mykrobe/data/NC_000962.3.gb"

        # Start from an empty database.
        DB.drop_database('mykrobe-test')

        with open(fasta_path, 'r') as fasta_handle:
            fasta_records = list(SeqIO.parse(fasta_handle, "fasta"))
            self.reference_seq = fasta_records[0].seq

        self.gm = GeneAminoAcidChangeToDNAVariants(reference=fasta_path,
                                                   genbank=genbank_path)

        ref_set = ReferenceSet().create_and_save(name="ref_set")
        self.reference_set = ref_set
        var_set = VariantSet.create_and_save(name="this_vcf_file",
                                             reference_set=ref_set)
        self.variant_set = var_set
        self.variant_sets = [var_set]
        self.reference_id = Reference().create_and_save(
            name="ref", md5checksum="sre", reference_sets=[ref_set])

    def test_simple_gene(self):
        """Check name, strand, DNA sequence and translation of rpoB.

        The gene is built without a ``forward`` argument and is expected to
        report itself as forward-strand.
        """
        g = Gene(name="rpoB",
                 reference=self.reference_seq,
                 start=759807,
                 end=763325)
        assert g.name == "rpoB"
        assert g.forward
        assert g.strand == "forward"
        # The expected sequence is written as two adjacent string literals:
        # a double-quoted literal cannot contain a raw line break.
        assert g.seq == (
            "TTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACG"
            "GCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA")
        assert g.prot == "LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA"
        DB.drop_database('mykrobe-test')

    def test_reverse_gene(self):
        """Check name, strand, DNA sequence and translation of gidB built with ``forward=False``."""
        expected_dna = "ATGTCTCCGATCGAGCCCGCGGCGTCTGCGATCTTCGGACCGCGGCTTGGCCTTGCTCGGCGGTACGCCGAAGCGTTGGCGGGACCCGGTGTGGAGCGGGGGCTGGTGGGACCCCGCGAAGTCGGTAGGCTATGGGACCGGCATCTACTGAACTGCGCCGTGATCGGTGAGCTCCTCGAACGCGGTGACCGGGTCGTGGATATCGGTAGCGGAGCCGGGTTGCCGGGCGTGCCATTGGCGATAGCGCGGCCGGACCTCCAGGTAGTTCTCCTAGAACCGCTACTGCGCCGCACCGAGTTTCTTCGAGAGATGGTGACAGATCTGGGCGTGGCCGTTGAGATCGTGCGGGGGCGCGCCGAGGAGTCCTGGGTGCAGGACCAATTGGGCGGCAGCGACGCTGCGGTGTCACGGGCGGTGGCCGCGTTGGACAAGTTGACGAAATGGAGCATGCCGTTGATACGGCCGAACGGGCGAATGCTCGCCATCAAAGGCGAGCGGGCTCACGACGAAGTACGGGAGCACCGGCGTGTGATGATCGCATCGGGCGCGGTTGATGTCAGGGTGGTGACATGTGGCGCGAACTATTTGCGTCCGCCCGCGACCGTGGTGTTCGCACGACGTGGAAAGCAGATCGCCCGAGGGTCGGCACGGATGGCGAGTGGAGGGACGGCGTGA"
        expected_protein = "MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLAIARPDLQVVLLEPLLRRTEFLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRMLAIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA"

        gid_b = Gene(name="gidB",
                     reference=self.reference_seq,
                     start=4407528,
                     end=4408202,
                     forward=False)

        assert gid_b.name == "gidB"
        assert gid_b.forward is False
        assert gid_b.strand == "reverse"
        assert gid_b.seq == expected_dna
        assert gid_b.prot == expected_protein
        DB.drop_database('mykrobe-test')

    def test_reverse_gene2(self):
        """Check name, strand, DNA sequence and translation of katG built with ``forward=False``."""
        g = Gene(name="katG",
                 reference=self.reference_seq,
                 start=2153889,
                 end=2156111,
                 forward=False)
        assert g.name == "katG"
        assert g.forward is False
        assert g.strand == "reverse"
        # The expected sequence is written as two adjacent string literals:
        # a double-quoted literal cannot contain a raw line break.
        assert g.seq == (
            "GTGCCCGAGCAACACCCACCCATTACAGAAACCACCACCGGAGCCGCTAGCAACGGCTGTCCCGTCGTGGGTCATATGAAATACCCCGTCGAGGGCGGCGGAAACCAGGACTGGTGGCCCAACCGGCTCAATCTGAAGGTACTGCACCAAAACCCGGCCGTCGCTGACCCGATGGGTGCGGCGTTCGACTATGCCGCGGAGGTCGCGACCATCGACGTTGACGCCCTGACGCGGGACATCGAGGAAGTGATGACCACCTCGCAGCCGTGGTGGCCCGCCGACTACGGCCACTACGGGCCGCTGTTTATCCGGATGGCGTGGCACGCTGCCGGCACCTACCGCATCCACGACGGCCGCGGCGGCGCCGGGGGCGGCATGCAGCGGTTCGCGCCGCTTAACAGCTGGCCCGACAACGCCAGCTTGGACAAGGCGCGCCGGCTGCTGTGGCCGGTCAAGAAGAAGTACGGCAAGAAGCTCTCATGGGCGGACCTGATTGTTTTCGCCGGCAACTGCGCGCTGGAATCGATGGGCTTCAAGACGTTCGGGTTCGGCTTCGGCCGGGTCGACCAGTGGGAGCCCGATGAGGTCTATTGGGGCAAGGAAGCCACCTGGCTCGGCGATGAGCGTTACAGCGGTAAGCGGGATCTGGAGAACCCGCTGGCCGCGGTGCAGATGGGGCTGATCTACGTGAACCCGGAGGGGCCGAACGGCAACCCGGACCCCATGGCCGCGGCGGTCGACATTCGCGAGACGTTTCGGCGCATGGCCATGAACGACGTCGAAACAGCGGCGCTGATCGTCGGCGGTCACACTTTCGGTAAGACCCATGGCGCCGGCCCGGCCGATCTGGTCGGCCCCGAACCCGAGGCTGCTCCGCTGGAGCAGATGGGCTTGGGCTGGAAGAGCTCGTATGGCACCGGAACCGGTAAGGACGCGATCACCAGCGGCATCGAGGTCGTATGGACGAACACCCCGACGAAATGGGACAACAGTTTCCTCGAGATCCTGTACGGCTACGAGTGGGAGCTGACGAAGAGCCCTGCTGGCGCTTGGCAATACACCGCCAAGGACGGCGCCGGTGCCGGCACCATCCCGGACCCGTTCGGCGGGCCAGGGCGCTCCCCGACGATGCTGGCCACTGACCTCTCGCTGCGGGTGGATCCGATCTATGAGCGGATCACGCGTCGCTGGCTGGAACACCCCGAGGAATTGGCCGACGAGTTCGCCAAGGCCTGGTACAAGCTGATCCACCGAGACATGGGTCCCGTTGCGAGATACCTTGGGCCGCTGGTCCCCAAGCAGACCCTGCTGTGGCAGGATCCGGTCCCTGCGGTCAGCCACGACCTCGTCGGCGAAGCCGAGATTGCCAGCCTTAAGAGCCAGATCCGGGCATCGGGATTGACTGTCTCACAGCTAGTTTCGACCGCATGGGCGGCGGCGTCGTCGTTCCGTGGTAGCGACAAGCGCGGCGGCGCCAACGGTGGTCGCATCCGCCTGCAGCCACAAGTCGGGTGGGAGGTCAACGACCCCGACGGGGATCTGCGCAAGGTCATTCGCACCCTGGAAGAGATCCAGGAGTCATTCAACTCCGCGGCGCCGGGGAACATCAAAGTGTCCTTCGCCGACCTCGTCGTGCTCGGTGGCTGTGCCGCCATAGAGAAAGCAGCAAAGGCGGCTGGCCACAACATCACGGTGCCCTTCACCCCGGGCCGCACGGATGCGTCGCAGGAACAAACCGACGTGGAATCCTTTGCCGTGCTGGAGCCCAAGGCAGATGGCTTCCGAAACTACCTCGGAAAGGGCAACCCGTTGCCGGCCGAGTACATGCTGCTCGACAAGGCGAACCTGCTTACGCTCAGTGCCCCTGAGATGACGGTGCTGGTAGGTGGCCTGCGCGTCCTCGGCGCAAACTACAAGCGCTTACCGCTGGGCGTGTTCACCGAGGCCTCCGAGTCACTGACCAACGACTTCTTCG"
            "TGAACCTGCTCGACATGGGTATCACCTGGGAGCCCTCGCCAGCAGATGACGGGACCTACCAGGGCAAGGATGGCAGTGGCAAGGTGAAGTGGACCGGCAGCCGCGTGGACCTGGTCTTCGGGTCCAACTCGGAGTTGCGGGCGCTTGTCGAGGTCTATGGCGCCGATGACGCGCAGCCGAAGTTCGTGCAGGACTTCGTCGCTGCCTGGGACAAGGTGATGAACCTCGACAGGTTCGACGTGCGCTGA")
        assert g.prot == "VPEQHPPITETTTGAASNGCPVVGHMKYPVEGGGNQDWWPNRLNLKVLHQNPAVADPMGAAFDYAAEVATIDVDALTRDIEEVMTTSQPWWPADYGHYGPLFIRMAWHAAGTYRIHDGRGGAGGGMQRFAPLNSWPDNASLDKARRLLWPVKKKYGKKLSWADLIVFAGNCALESMGFKTFGFGFGRVDQWEPDEVYWGKEATWLGDERYSGKRDLENPLAAVQMGLIYVNPEGPNGNPDPMAAAVDIRETFRRMAMNDVETAALIVGGHTFGKTHGAGPADLVGPEPEAAPLEQMGLGWKSSYGTGTGKDAITSGIEVVWTNTPTKWDNSFLEILYGYEWELTKSPAGAWQYTAKDGAGAGTIPDPFGGPGRSPTMLATDLSLRVDPIYERITRRWLEHPEELADEFAKAWYKLIHRDMGPVARYLGPLVPKQTLLWQDPVPAVSHDLVGEAEIASLKSQIRASGLTVSQLVSTAWAAASSFRGSDKRGGANGGRIRLQPQVGWEVNDPDGDLRKVIRTLEEIQESFNSAAPGNIKVSFADLVVLGGCAAIEKAAKAAGHNITVPFTPGRTDASQEQTDVESFAVLEPKADGFRNYLGKGNPLPAEYMLLDKANLLTLSAPEMTVLVGGLRVLGANYKRLPLGVFTEASESLTNDFFVNLLDMGITWEPSPADDGTYQGKDGSGKVKWTGSRVDLVFGSNSELRALVEVYGADDAQPKFVQDFVAAWDKVMNLDRFDVR"
        DB.drop_database('mykrobe-test')

    def test_get_codon(self):
        """Check codon lookup and gene-to-reference coordinates on rpoB (no ``forward`` argument)."""
        rpob = Gene(name="rpoB",
                    reference=self.reference_seq,
                    start=759807,
                    end=763325)

        # Codon index 1173 is expected to be rejected.
        with pytest.raises(ValueError):
            rpob.get_codon(1173)

        for codon_index, expected_codon in ((2, "GCA"), (3, "GAT")):
            assert rpob.get_codon(codon_index) == expected_codon

        # Gene position 1 is reference coordinate 759807, i.e. 0-based index
        # 759806 of the reference sequence; position -1 is the base before it.
        assert rpob.get_reference_position(1) == 759807
        assert rpob.seq[0] == self.reference_seq[759806]
        assert rpob.get_reference_position(-1) == 759806
        DB.drop_database('mykrobe-test')

    def test_get_codon_reverse(self):
        """Check codon lookup and gene-to-reference coordinates on gidB built with ``forward=False``."""
        gid_b = Gene(name="gidB",
                     reference=self.reference_seq,
                     start=4407528,
                     end=4408202,
                     forward=False)

        # Codon index 225 is expected to be rejected.
        with pytest.raises(ValueError):
            gid_b.get_codon(225)

        for codon_index, expected_codon in ((2, "TCT"), (3, "CCG")):
            assert gid_b.get_codon(codon_index) == expected_codon

        # Gene positions count down from `end` in reference coordinates, and
        # negative (upstream) positions lie beyond `end`.
        expected_positions = (
            (1, 4408202),
            (2, 4408201),
            (-1, 4408203),
            (-2, 4408204),
        )
        for gene_position, reference_position in expected_positions:
            assert gid_b.get_reference_position(
                gene_position) == reference_position
        DB.drop_database('mykrobe-test')

    def test_gene_muts(self):
        """Check the codons offered for K and the variant names for rpoB D3A and D3X."""
        self.gm = GeneAminoAcidChangeToDNAVariants(
            genbank="src/mykrobe/data/NC_000962.3.gb",
            reference="src/mykrobe/data/NC_000962.3.fasta")
        assert self.gm.get_alts("K") == ['AAA', 'AAG']

        # Every expected name has reference codon GAT at position 759813.
        ref_prefix = 'GAT759813'

        # D3A: GAT -> each of the four GCx codons.
        ala_codons = ['GCA', 'GCT', 'GCC', 'GCG']
        expected_d3a = [ref_prefix + codon for codon in ala_codons]
        found_d3a = self.gm.get_variant_names("rpoB", "D3A")
        assert sorted(found_d3a) == sorted(expected_d3a)

        # D3X: GAT -> each of the 59 codons listed below (neither GAT nor
        # GAC appears in the list).
        any_change_codons = [
            'GCA', 'GCT', 'GCC', 'GCG',
            'TGT', 'TGC', 'GAA', 'GAG',
            'GGA', 'GGT', 'GGC', 'GGG',
            'TTT', 'TTC', 'ATA', 'ATT',
            'ATC', 'CAT', 'CAC', 'AAA',
            'AAG', 'ATG', 'TTA', 'TTG',
            'CTA', 'CTT', 'CTC', 'CTG',
            'AAT', 'AAC', 'CAA', 'CAG',
            'CCA', 'CCT', 'CCC', 'CCG',
            'AGT', 'AGC', 'TCA', 'TCT',
            'TCC', 'TCG', 'AGA', 'AGG',
            'CGA', 'CGT', 'CGC', 'CGG',
            'ACA', 'ACT', 'ACC', 'ACG',
            'TGG', 'GTA', 'GTT', 'GTC',
            'GTG', 'TAT', 'TAC',
        ]
        expected_d3x = [ref_prefix + codon for codon in any_change_codons]
        found_d3x = self.gm.get_variant_names("rpoB", "D3X")
        assert sorted(found_d3x) == sorted(expected_d3x)
        DB.drop_database('mykrobe-test')

    def test_gene_muts2(self):
        """Check the codons offered for K and the variant names for katG E3A."""
        self.gm = GeneAminoAcidChangeToDNAVariants(
            genbank="src/mykrobe/data/NC_000962.3.gb",
            reference="src/mykrobe/data/NC_000962.3.fasta")
        assert self.gm.get_alts("K") == ['AAA', 'AAG']

        # The expected names are the reverse complements of the coding-strand
        # codons: GAG (E) -> CTC as reference at 2156103, and the GCx (A)
        # codons GCA/GCT/GCC/GCG -> TGC/AGC/GGC/CGC as alternates.
        ref_prefix = 'CTC2156103'
        alt_codons = ['TGC', 'AGC', 'GGC', 'CGC']
        expected_e3a = [ref_prefix + codon for codon in alt_codons]
        found_e3a = self.gm.get_variant_names("katG", "E3A")
        assert sorted(found_e3a) == sorted(expected_e3a)
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel1(self):
        """For every DNA variant of rpoB D3A, check the alt probes change residue 3 from D to A."""
        allele_generator = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        rpob = self.gm.get_gene("rpoB")
        for var_name in self.gm.get_variant_names("rpoB", "D3A"):
            ref_bases, position, alt_bases = split_var_name(var_name)
            variant = Variant.create(variant_sets=self.variant_sets,
                                     reference=self.reference_id,
                                     reference_bases=ref_bases,
                                     start=position,
                                     alternate_bases=[alt_bases])
            panel = allele_generator.create(variant)
            for alt_probe in panel.alts:
                gene_dna = copy.copy(str(rpob.seq))
                assert Seq(gene_dna).translate()[2] == "D"
                # NOTE(review): only the tails of the probes are substituted;
                # the 25/24 offsets are kept exactly as they were.
                mutated_dna = gene_dna.replace(panel.refs[0][25:],
                                               alt_probe[24:])
                assert mutated_dna != str(rpob.seq)
                assert Seq(mutated_dna).translate()[2] == "A"
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel2(self):
        """For every DNA variant of katG E3A, check the alt probes change residue 3 to A.

        ``gene.seq`` is reverse-complemented before the substitution and the
        result is reverse-complemented again before translation.
        """
        ag = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        gene = self.gm.get_gene("katG")
        # Unmodified reverse complement of the gene, used as the baseline.
        forward_seq = str(gene.seq.reverse_complement())
        for var in self.gm.get_variant_names("katG", "E3A"):
            ref, start, alt = split_var_name(var)
            v = Variant.create(variant_sets=self.variant_sets,
                               reference=self.reference_id,
                               reference_bases=ref,
                               start=start,
                               alternate_bases=[alt])
            panel = ag.create(v)
            for alt_probe in panel.alts:
                # Only the first 39 bases of the ref probe are substituted;
                # the alt slice is adjusted by the ref/alt length difference.
                seq = forward_seq.replace(
                    panel.refs[0][:39],
                    alt_probe[:39 + len(alt_probe) - len(panel.refs[0])])
                # Compare with the unmodified sequence in the SAME
                # orientation. Comparing with gene.seq, which is the reverse
                # complement of `seq`, could never fail.
                assert seq != forward_seq
                assert Seq(seq).reverse_complement().translate()[2] == "A"
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel3(self):
        """For every DNA variant of katG S315L, check the alt probes give L at residue 315.

        ``gene.seq`` is reverse-complemented before the substitution and the
        result is reverse-complemented again before translation.
        """
        ag = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        gene = self.gm.get_gene("katG")
        # Unmodified reverse complement of the gene, used as the baseline.
        forward_seq = str(gene.seq.reverse_complement())
        for var in self.gm.get_variant_names("katG", "S315L"):
            ref, start, alt = split_var_name(var)
            v = Variant.create(variant_sets=self.variant_sets,
                               reference=self.reference_id,
                               reference_bases=ref,
                               start=start,
                               alternate_bases=[alt])
            panel = ag.create(v)
            for alt_probe in panel.alts:
                seq = forward_seq.replace(panel.refs[0], alt_probe)
                # Compare with the unmodified sequence in the SAME
                # orientation. Comparing with gene.seq, which is the reverse
                # complement of `seq`, could never fail.
                assert seq != forward_seq
                assert Seq(seq).reverse_complement().translate()[314] == "L"
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel4(self):
        """For every DNA variant of katG W90R, check the alt probes give R at residue 90.

        ``gene.seq`` is reverse-complemented before the substitution and the
        result is reverse-complemented again before translation.
        """
        ag = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        gene = self.gm.get_gene("katG")
        # Unmodified reverse complement of the gene, used as the baseline.
        forward_seq = str(gene.seq.reverse_complement())
        for var in self.gm.get_variant_names("katG", "W90R"):
            ref, start, alt = split_var_name(var)
            v = Variant.create(variant_sets=self.variant_sets,
                               reference=self.reference_id,
                               reference_bases=ref,
                               start=start,
                               alternate_bases=[alt])
            panel = ag.create(v)
            for alt_probe in panel.alts:
                seq = forward_seq.replace(panel.refs[0], alt_probe)
                # Compare with the unmodified sequence in the SAME
                # orientation. Comparing with gene.seq, which is the reverse
                # complement of `seq`, could never fail.
                assert seq != forward_seq
                assert Seq(seq).reverse_complement().translate()[89] == "R"
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel5(self):
        """For every DNA variant of gyrA D94X, check the alt probes no longer give D at residue 94."""
        allele_generator = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        gyra = self.gm.get_gene("gyrA")
        for var_name in self.gm.get_variant_names("gyrA", "D94X"):
            ref_bases, position, alt_bases = split_var_name(var_name)
            variant = Variant.create(variant_sets=self.variant_sets,
                                     reference=self.reference_id,
                                     reference_bases=ref_bases,
                                     start=position,
                                     alternate_bases=[alt_bases])
            panel = allele_generator.create(variant)
            for alt_probe in panel.alts:
                gene_dna = copy.copy(str(gyra.seq))
                mutated_dna = gene_dna.replace(panel.refs[0], alt_probe)
                assert Seq(mutated_dna).translate()[93] != "D"
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel6(self):
        """Check the single probe built for the DNA-level change CAG28TAA in pncA."""
        allele_generator = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        gene = self.gm.get_gene("pncA")  # looked up but not used below
        var_names = list(
            self.gm.get_variant_names("pncA",
                                      "CAG28TAA",
                                      protein_coding_var=False))
        assert len(var_names) == 1

        ref_bases, position, alt_bases = split_var_name(var_names[0])
        assert ref_bases == 'CTG'
        assert position == 2289212
        assert alt_bases == 'TTA'

        variant = Variant.create(variant_sets=self.variant_sets,
                                 reference=self.reference_id,
                                 reference_bases=ref_bases,
                                 start=position,
                                 alternate_bases=[alt_bases])
        panel = allele_generator.create(variant)
        assert len(panel.alts) == 1
        alt_probe = panel.alts[0]
        ref_probe = panel.refs[0]

        # The probe sequences extend beyond the gene, so they are checked
        # against the matching stretch of the reference genome rather than
        # against the gene sequence.
        probe_start = self.reference_seq.find(ref_probe)
        probe_end = probe_start + len(ref_probe)
        assert probe_start < position < probe_end
        genome_slice = str(self.reference_seq[probe_start:probe_end])
        assert genome_slice == ref_probe

        # The three bases at indices 30-32 of the probe are replaced by 'TTA'.
        assert alt_probe == genome_slice[:30] + 'TTA' + genome_slice[33:]
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel7(self):
        """Check the single probe built for the DNA-level change G-10A upstream of eis."""
        # A DNA change upstream of a gene on the reverse strand. G-10A is
        # written in "gene space": ten bases upstream of eis the reverse
        # strand has a G. In the genome that position is 2715342 and the
        # forward strand has a C there, so in "ref space" the variant is
        # C2715342T.
        #
        #             | <- this C is at -10 in "gene space" (ref coord 2715342)
        #             |
        # CACAGAATCCGACTGTGGCATATGCCGC
        #   |
        #   | <- C = last nucleotide of the gene, at 2715332
        allele_generator = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        gene = self.gm.get_gene("eis")  # looked up but not used below
        var_names = list(
            self.gm.get_variant_names("eis", "G-10A",
                                      protein_coding_var=False))
        assert len(var_names) == 1

        ref_bases, position, alt_bases = split_var_name(var_names[0])
        assert ref_bases == 'C'
        assert position == 2715342
        assert alt_bases == 'T'

        variant = Variant.create(variant_sets=self.variant_sets,
                                 reference=self.reference_id,
                                 reference_bases=ref_bases,
                                 start=position,
                                 alternate_bases=[alt_bases])
        panel = allele_generator.create(variant)
        assert len(panel.alts) == 1
        alt_probe = panel.alts[0]
        ref_probe = panel.refs[0]

        # The probe sequences extend beyond the gene, so they are checked
        # against the matching stretch of the reference genome rather than
        # against the gene sequence.
        probe_start = self.reference_seq.find(ref_probe)
        probe_end = probe_start + len(ref_probe)
        assert probe_start < position < probe_end
        genome_slice = str(self.reference_seq[probe_start:probe_end])
        assert genome_slice == ref_probe

        # The single base at index 30 of the probe is replaced by 'T'.
        assert alt_probe == genome_slice[:30] + 'T' + genome_slice[31:]
        DB.drop_database('mykrobe-test')

    def test_make_variant_panel8(self):
        """Check the single probe built for the DNA-level change TG-1T in eis.

        The variant is expected in reference coordinates as 'CA' -> 'A' at
        2715332, and the alt probe must equal the reference probe with the
        base at index 30 removed.
        """
        ag = AlleleGenerator("src/mykrobe/data/NC_000962.3.fasta")
        gene = self.gm.get_gene("eis")  # looked up but not used below
        variants = list(
            self.gm.get_variant_names("eis", "TG-1T",
                                      protein_coding_var=False))
        assert len(variants) == 1
        var = variants[0]
        ref, start, alt = split_var_name(var)
        assert ref == 'CA'
        assert start == 2715332
        assert alt == 'A'
        v = Variant.create(variant_sets=self.variant_sets,
                           reference=self.reference_id,
                           reference_bases=ref,
                           start=start,
                           alternate_bases=[alt])
        panel = ag.create(v)
        assert len(panel.alts) == 1
        alt = panel.alts[0]
        # The panel ref/alt sequences extend past the end of the gene, so
        # they cannot be compared against the gene sequence; the matching
        # subsequence of the reference genome is used instead.
        panel_ref_start = self.reference_seq.find(panel.refs[0])
        assert panel_ref_start < start < panel_ref_start + len(panel.refs[0])
        seq = str(self.reference_seq[panel_ref_start:panel_ref_start +
                                     len(panel.refs[0])])
        assert seq == panel.refs[0]
        # Naming the expected value lets the assertion itself report both
        # operands on failure.
        expected_alt = seq[:30] + seq[31:]
        assert alt == expected_alt
        DB.drop_database('mykrobe-test')
コード例 #3
0
def run(parser, args):
    """Build probe sequences for the requested variants and write them to stdout.

    Variants come from one of three places: a VCF (delegated to
    ``run_make_probes_from_vcf_file``), gene-level mutations resolved through
    a GenBank annotation, or plain DNA variant names. For every collected
    mutation a probe panel is requested from ``make_variant_probe`` and its
    reference and alternate sequences are written as FASTA-style records.

    Args:
        parser: Unused by this function.
        args: Parsed command-line namespace. The attributes read here are
            ``db_name``, ``reference_filepath``, ``vcf``, ``genbank``,
            ``text_file``, ``variants``, ``kmer`` and ``no_backgrounds``.
    """
    DB = connect('mykrobe-%s' % (args.db_name))
    if DB is not None:
        try:
            # The result is discarded; the query only shows whether the
            # server answers.
            Variant.objects()
            logger.info("Connected to mykrobe-%s", args.db_name)
        except ServerSelectionTimeoutError:
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds")

    mutations = []
    # Reference name is whatever precedes the first '.fa' in the file name.
    reference = os.path.basename(args.reference_filepath).split('.fa')[0]

    if args.vcf:
        # `mutations` stays empty on this path, so the loop below writes
        # nothing.
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(
            args.reference_filepath,
            args.genbank)
        if args.text_file:
            with open(args.text_file, 'r') as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    # Each row: gene, mutation, alphabet.
                    gene, mutation_string, alphabet = row
                    # Anything other than "DNA" is handled as protein coding.
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutations.append(
                            Mutation(reference=reference,
                                     var_name=var_name,
                                     gene=aa2dna.get_gene(gene),
                                     mut=mutation_string))
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    # NOTE(review): the gene is passed as its raw name string
                    # here, while the text-file branch above passes
                    # aa2dna.get_gene(gene). Confirm this is intended.
                    mutations.append(
                        Mutation(reference=reference,
                                 var_name=var_name,
                                 gene=gene,
                                 mut=mutation))
    else:
        if args.text_file:
            with open(args.text_file, 'r') as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    # Each row must have exactly five columns.
                    row_gene, pos, ref, alt, _alphabet = row
                    if row_gene == "ref":
                        var_name = "".join([ref, pos, alt])
                    else:
                        var_name = row[0]
                    mutations.append(
                        Mutation(reference=reference, var_name=var_name))
        else:
            mutations.extend(Mutation(reference=reference, var_name=v)
                             for v in args.variants)

    al = AlleleGenerator(
        reference_filepath=args.reference_filepath,
        kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info(
                "%i of %i - %f%%", enum, total, round(100*enum/total, 2))
        variant_panel = make_variant_probe(
            al, mut.variant, args.kmer, DB=DB, no_backgrounds=args.no_backgrounds)
        if variant_panel is None:
            logger.warning(
                "All variants failed for %s_%s - %s",
                mut.gene, mut.mut, mut.variant)
            continue

        # Resolve the gene name once per mutation, before either loop. The
        # alt records need it even when the panel has no ref sequences.
        try:
            gene_name = mut.gene.name
        except AttributeError:
            gene_name = "NA"

        for i, ref in enumerate(variant_panel.refs):
            sys.stdout.write(
                ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n" %
                (mut.mut, mut.variant.var_name, len(
                    variant_panel.alts), mut.reference, i, gene_name, mut.mut))
            sys.stdout.write("%s\n" % ref)

        for i, a in enumerate(variant_panel.alts):
            sys.stdout.write(">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" %
                             (mut.mut, mut.variant.var_name, i, gene_name, mut.mut))
            sys.stdout.write("%s\n" % a)
コード例 #4
0
def run(parser, args):
    """Build probe sequences for the requested variants and write them to stdout.

    Variants come from one of three places: a VCF (delegated to
    ``run_make_probes_from_vcf_file``), gene-level mutations resolved through
    a GenBank annotation, or DNA variants given directly or loaded with
    ``load_dna_vars_txt_file``. For every collected mutation a probe panel is
    requested from ``make_variant_probe`` and its reference and alternate
    sequences are written as FASTA-style records.

    Args:
        parser: Unused by this function.
        args: Parsed command-line namespace. The attributes read here are
            ``no_backgrounds``, ``db_name``, ``reference_filepath``, ``vcf``,
            ``genbank``, ``text_file``, ``lineage``, ``variants`` and
            ``kmer``.
    """
    # There's no need to try to connect to database if we're not doing backgrounds
    if args.no_backgrounds:
        logger.info(
            "Not connecting to database, because --no-backgrounds option used")
        DB = None
    else:
        DB = connect("%s-%s" % (DB_PREFIX, args.db_name))

    if DB is not None:
        try:
            # The result is discarded; the query only shows whether the
            # server answers.
            Variant.objects()
            logger.info("Connected to %s-%s", DB_PREFIX, args.db_name)
        except ServerSelectionTimeoutError:
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds"
            )

    mutations = []
    # Reference name is whatever precedes the first '.fa' in the file name.
    reference = os.path.basename(args.reference_filepath).split(".fa")[0]

    if args.vcf:
        # `mutations` stays empty on this path, so the loop below writes
        # nothing.
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(args.reference_filepath,
                                                  args.genbank)
        if args.text_file:
            with open(args.text_file, "r") as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    # Each row: gene, mutation, alphabet.
                    gene, mutation_string, alphabet = row
                    # Anything other than "DNA" is handled as protein coding.
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutations.append(
                            Mutation(
                                reference=reference,
                                var_name=var_name,
                                gene=aa2dna.get_gene(gene),
                                mut=mutation_string,
                                protein_coding_var=protein_coding_var,
                            ))
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    # NOTE(review): the gene is passed as its raw name string
                    # here, while the text-file branch above passes
                    # aa2dna.get_gene(gene). Confirm this is intended.
                    mutations.append(
                        Mutation(
                            reference=reference,
                            var_name=var_name,
                            gene=gene,
                            mut=mutation,
                        ))
    else:
        if args.text_file:
            mutations, lineages = load_dna_vars_txt_file(
                args.text_file, reference)
            if args.lineage:
                # Write the lineage data returned by the loader as JSON.
                with open(args.lineage, "w") as f:
                    json.dump(lineages, f, sort_keys=True, indent=2)
        else:
            mutations.extend(
                Mutation(reference=reference, var_name=v)
                for v in args.variants)

    al = AlleleGenerator(reference_filepath=args.reference_filepath,
                         kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info("%i of %i - %f%%", enum, total,
                        round(100 * enum / total, 2))
        variant_panel = make_variant_probe(al,
                                           mut.variant,
                                           args.kmer,
                                           DB=DB,
                                           no_backgrounds=args.no_backgrounds)
        if variant_panel is None:
            logger.warning("All variants failed for %s_%s - %s",
                           mut.gene, mut.mutation_output_name, mut.variant)
            continue

        # Resolve the gene name once per mutation, before either loop. The
        # alt records need it even when the panel has no ref sequences.
        try:
            gene_name = mut.gene.name
        except AttributeError:
            gene_name = "NA"

        for i, ref in enumerate(variant_panel.refs):
            sys.stdout.write(
                ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n"
                % (
                    mut.mutation_output_name,
                    mut.variant.var_name,
                    len(variant_panel.alts),
                    mut.reference,
                    i,
                    gene_name,
                    mut.mutation_output_name,
                ))
            sys.stdout.write("%s\n" % ref)

        for i, a in enumerate(variant_panel.alts):
            sys.stdout.write(
                ">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" % (
                    mut.mutation_output_name,
                    mut.variant.var_name,
                    i,
                    gene_name,
                    mut.mutation_output_name,
                ))
            sys.stdout.write("%s\n" % a)