예제 #1
0
    def test_cds_function(self):
        # Covers the default gene function of a CDS, the rejection of invalid
        # values, and how the classification shifts as tools add functions
        feature = CDSFeature(FeatureLocation(1, 5, 1), locus_tag="foo")
        functions = feature.gene_functions

        # nothing has been added yet, so both views report OTHER
        assert functions.get_classification() == GeneFunction.OTHER
        assert feature.gene_function == GeneFunction.OTHER

        # a plain string is not accepted as a function
        with self.assertRaises(AssertionError):
            functions.add("other", "a", "b")
        # and the container itself can't be reassigned
        with self.assertRaises(AttributeError):
            feature.gene_functions = 0

        # a single annotation sets the classification, then a conflicting
        # annotation from a second tool sends it back to OTHER
        for function, tool, expected in [
                (GeneFunction.ADDITIONAL, "first_tool", GeneFunction.ADDITIONAL),
                (GeneFunction.TRANSPORT, "other_tool", GeneFunction.OTHER)]:
            functions.add(function, tool, "dummy")
            assert functions.get_classification() == expected
            assert feature.gene_function == expected

        # smcogs overrides the conflict, and the cluster definition
        # overrides even smcogs
        for function, tool in [(GeneFunction.REGULATORY, "smcogs"),
                               (GeneFunction.CORE, "cluster_definition")]:
            functions.add(function, tool, "dummy")
            assert functions.get_classification() == function

        # all of the earlier annotations must still be tracked
        by_tool = functions.get_by_tool("smcogs")
        assert len(by_tool) == 1
        assert by_tool[0].function == GeneFunction.REGULATORY

        by_function = functions.get_by_function(GeneFunction.ADDITIONAL)
        assert len(by_function) == 1
        assert by_function[0].tool == "first_tool"
예제 #2
0
 def setUp(self):
     """ Builds the fixtures shared by the tests: a coding sequence for the
         protein "MAGIC", the same sequence split into three parts by 'x'
         filler, and a forward-strand compound CDS covering those parts
     """
     self.translation = "MAGIC"
     self.magic = Seq("ATGGCAGGTATTTGT")
     self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")
     # (start, end) of each part of the split sequence that isn't filler
     part_bounds = [(0, 6), (12, 15), (21, 27)]
     self.sub_locations = [FeatureLocation(start, end, strand=1)
                           for start, end in part_bounds]
     self.location = CompoundLocation(self.sub_locations)
     self.cds = CDSFeature(self.location, locus_tag="compound")
예제 #3
0
 def test_simple_location_forward_partial(self):
     # partial protein ranges of a single-part forward CDS must map onto the
     # matching slice of nucleotides and the matching slice of translation
     cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
     protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
     for first, last in protein_ranges:
         print("testing", first, last)
         sub_location = cds.get_sub_location_from_protein_coordinates(first, last)
         print(sub_location)
         nucleotides = sub_location.extract(self.magic)
         # the test expects three bases per amino acid
         expected_nucleotides = self.magic[first * 3: last * 3]
         assert nucleotides == expected_nucleotides
         assert nucleotides.translate() == self.translation[first:last]
예제 #4
0
 def test_compound_location_reverse_full(self):
     # the full protein range of a reverse-strand compound CDS must give
     # back all three parts, whichever order the parts were supplied in
     self.reverse_strand()
     for step in (1, -1):
         print("sub locations sorted in direction:", step)
         ordered_parts = self.sub_locations[::step]
         cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
         result = cds.get_sub_location_from_protein_coordinates(0, 5)
         assert isinstance(result, CompoundLocation)
         assert len(result.parts) == 3
         print([str(part) for part in cds.location.parts])
         print([str(part) for part in result.parts])
         assert len(result) == len(cds.location)
         translated = result.extract(self.magic_split).translate()
         assert translated == self.translation[0:5]
예제 #5
0
    def setUp(self):
        """ Builds a clusterfinder config and a dummy record holding one CDS
            and one PFAM domain, sharing a location, for each table entry
        """
        options = [
            "--cf-create-clusters",
            "--cf-mean-threshold", "0.6",
            "--cf-min-cds", "5",
            "--cf-min-pfams", "5",
        ]
        self.config = build_config(options, modules=[clusterfinder], isolated=True)
        update_config({"enabled_cluster_types": []})

        self.record = DummyRecord(seq=Seq("A" * 2000))
        # (start, end, probability, PFAM identifier)
        domain_table = [
            (10, 20, 0.1, 'FAKE007'),
            (30, 40, 0.3, 'PF00106'),
            (50, 60, 0.4, 'PF00107'),
            (60, 70, 0.7, 'PF00109'),
            (70, 80, 0.98, 'PF08484'),
            (90, 100, 0.8, 'PF02401'),
            (100, 110, 0.32, 'PF04369'),
            (110, 120, 1.0, 'PF00128'),
            (130, 140, 0.2, 'FAKE234'),
            (500, 505, None, 'FAKE505'),
            (1010, 1020, 0.1, 'FAKE007'),
            (1030, 1040, 0.3, 'PF00106'),
            (1050, 1060, 0.4, 'PF00107'),
            (1060, 1070, 0.7, 'PF00109'),
            (1070, 1080, 0.98, 'PF08484'),
            (1090, 1100, 0.8, 'PF02401'),
            (1100, 1110, 0.32, 'PF04369'),
            (1110, 1120, 1.0, 'PF00128'),
        ]
        for start, end, probability, pfam_id in domain_table:
            location = FeatureLocation(start, end)
            # the CDS is named after its start coordinate
            cds = CDSFeature(location, locus_tag=str(start))
            self.record.add_cds_feature(cds)

            domain = PFAMDomain(location, "dummy_description")
            domain.db_xref.append(pfam_id)
            domain.probability = probability
            self.record.add_pfam_domain(domain)
예제 #6
0
    def test_compound_location_reverse_multiple(self):
        # a protein range crossing a part boundary of a reverse-strand CDS
        # must come back as two parts, regardless of the supplied part order
        self.reverse_strand()
        # (start, end) expected of each returned part, in order
        expected_bounds = [(12, 15), (3, 6)]
        for step in (1, -1):
            print("sub locations sorted in direction:", step)
            ordered_parts = self.sub_locations[::step]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")

            result = cds.get_sub_location_from_protein_coordinates(2, 4)
            assert isinstance(result, CompoundLocation)
            print([str(part) for part in cds.location.parts])
            print([str(part) for part in result.parts])
            assert len(result.parts) == 2
            assert len(result) == 6
            for part, (start, end) in zip(result.parts, expected_bounds):
                assert part.start == start
                assert part.end == end
            translated = result.extract(self.magic_split).translate()
            assert translated == self.translation[2:4]
예제 #7
0
 def test_classification_with_colon(self):
     # The SMCOG id and its description are kept in a single string with ':'
     # as the separator, so a description that itself contains ':' has to
     # come through intact
     hit_id = "SMCOG1212:sodium:dicarboxylate_symporter"
     cds = CDSFeature(FeatureLocation(0, 100), locus_tag="test", translation="AAA")
     record = helpers.DummyRecord(features=[cds], seq="A" * 100)
     record.add_cluster(helpers.DummyCluster(0, 100))

     results = SMCOGResults(record.id)
     results.best_hits[cds.get_name()] = HMMResult(hit_id, 0, 100, 2.3e-126, 416)
     results.add_to_record(record)

     from_smcogs = cds.gene_functions.get_by_tool("smcogs")
     assert len(from_smcogs) == 1
     expected_prefix = ("transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                        " (Score: 416; E-value: 2.3e-126)")
     assert str(from_smcogs[0]).startswith(expected_prefix)
예제 #8
0
    def test_compound_location_reverse_single(self):
        # protein ranges that sit within one part of a reverse-strand CDS
        # must come back as a simple location, for either supplied part order
        self.reverse_strand()
        # (protein start, protein end, expected start, expected end, expected length)
        cases = [(0, 2, 21, 27, 6),
                 (2, 3, 12, 15, 3)]
        for step in (1, -1):
            print("sub locations sorted in direction:", step)
            ordered_parts = self.sub_locations[::step]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")

            for prot_start, prot_end, nuc_start, nuc_end, size in cases:
                result = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
                assert isinstance(result, FeatureLocation)
                assert len(result) == size
                assert result.start == nuc_start
                assert result.end == nuc_end
                translated = result.extract(self.magic_split).translate()
                assert translated == self.translation[prot_start:prot_end]
예제 #9
0
 def test_cds_function_conversion(self):
     # the gene function must survive a conversion to biopython and back,
     # both for the default and after a function has been added
     def round_trip(feature):
         """ Rebuilds a CDSFeature from the first biopython feature that the
             given CDSFeature converts to
         """
         return CDSFeature.from_biopython(feature.to_biopython()[0])

     original = CDSFeature(FeatureLocation(1, 5, 1), locus_tag="foo")
     assert original.gene_function == GeneFunction.OTHER
     assert round_trip(original).gene_function == GeneFunction.OTHER

     original.gene_functions.add(GeneFunction.ADDITIONAL, "tool", "desc")
     assert original.gene_function == GeneFunction.ADDITIONAL
     assert round_trip(original).gene_function == GeneFunction.ADDITIONAL
예제 #10
0
 def test_membership(self):
     # assigning an attribute that a feature type doesn't define must fail
     loc = FeatureLocation(0, 1, strand=1)

     # a plain Feature has no locus tag
     with self.assertRaises(AttributeError):
         generic = Feature(loc, feature_type="none")
         generic.locus_tag = "something"

     # a CDSFeature has no 'other_value'
     with self.assertRaises(AttributeError):
         cds = CDSFeature(loc, translation="none", gene="a")
         cds.other_value = 1

     # a Cluster has 'products' (plural), but no 'product'
     cluster = Cluster(loc, 0, 0, products=["a", "b"])
     assert cluster.products == ("a", "b")
     with self.assertRaises(AttributeError):
         cluster.product = ["c", "d"]
예제 #11
0
def generate_motif_features(record: Record, feature: CDSFeature,
                            motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Builds a CDSMotif feature for each HMMResult found within a CDS

        Arguments:
            record: the record supplying the sequence that each motif's
                    translation is extracted from
            feature: the CDSFeature the motifs were found in
            motifs: the HMMResults to convert, their query coordinates being
                    protein coordinates within the CDS

        Returns:
            a list of CDSMotif features, in the same order as the given motifs
    """
    # prefer the locus tag, otherwise fall back to the name of the CDS
    locus_tag = feature.locus_tag or feature.get_name()
    # use translation table 1 if the CDS doesn't supply one
    transl_table = feature.transl_table or 1

    motif_features = []
    # the counter is user facing, so it is 1-indexed
    for index, hit in enumerate(motifs, 1):
        location = feature.get_sub_location_from_protein_coordinates(
            hit.query_start, hit.query_end)
        new_motif = CDSMotif(location)
        new_motif.label = hit.hit_id
        new_motif.motif = hit.hit_id  # TODO: why both label AND motif?
        new_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, index)
        new_motif.evalue = hit.evalue
        new_motif.score = hit.bitscore
        new_motif.tool = "pksnrpsmotif"
        new_motif.detection = "hmmscan"
        new_motif.database = "abmotifs"
        new_motif.locus_tag = locus_tag

        nucleotides = new_motif.extract(record.seq)
        new_motif.translation = str(nucleotides.translate(table=transl_table))

        # TODO move to CDSMotif
        note = "NRPS/PKS Motif: %s (e-value: %s, bit-score: %s)" % (
            hit.hit_id, hit.evalue, hit.bitscore)
        new_motif.notes.append(note)

        motif_features.append(new_motif)
    return motif_features
예제 #12
0
 def test_required_identifiers(self):
     # with no identifier at all, construction has to fail
     with self.assertRaises(ValueError):
         CDSFeature(FeatureLocation(1, 5, 1))
     # while any single one of these identifiers is enough
     for identifier in ("locus_tag", "protein_id", "gene"):
         assert CDSFeature(FeatureLocation(1, 5, 1), **{identifier: "foo"})
예제 #13
0
 def test_complicated(self):
     # a ten-part forward-strand CDS, checked for protein coordinates 353-412
     part_bounds = [(121124, 122061), (122339, 122383),
                    (122559, 122666), (122712, 122874),
                    (123060, 123337), (123481, 123749),
                    (123809, 124032), (124091, 124193),
                    (124236, 124401), (124684, 124724)]
     exons = [FeatureLocation(start, end, 1) for start, end in part_bounds]
     location = CompoundLocation(exons, operator="join")
     cds = CDSFeature(location, locus_tag="complicated")
     nucleotides = (
         "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
         "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
         "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
         "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
         "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
         "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
         "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
         "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
         "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
         "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
         "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
         "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
         "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
         "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
         "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
         "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
         "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
         "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
         "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
         "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
         "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
         "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
         "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
         "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
         "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
         "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
         "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
         "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
         "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
         "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
         "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
         "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
         "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
         "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
         "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
         "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
         "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
         "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
         "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
         "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
         "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
         "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
         "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
         "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
         "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
         "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
         "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
         "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
         "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
         "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
     )
     protein = (
         "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
         "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
         "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
         "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
         "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
         "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
         "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
         "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
         "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
         "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
         "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
         "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
         "HTITLR"
     )
     prot_start, prot_end = 353, 412
     sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
     # pad the front of the sequence with filler so that its coordinates
     # line up with those of the location
     padded = Seq("x" * location.start + nucleotides)
     assert sub_location.extract(padded).translate() == protein[prot_start:prot_end]
예제 #14
0
def generate_domain_features(
        record: Record, gene: CDSFeature,
        domains: List[HMMResult]) -> Dict[HMMResult, AntismashDomain]:
    """ Builds an AntismashDomain feature for every HMMResult given

        Hits of the types "AMP-binding", "PKS_AT", "CAL_domain" and "PKS_KR"
        get a label of the gene name, a short form of the type, and a
        per-type counter. All other hits get no label and share one counter.

        Arguments:
            record: the record supplying the sequence that each domain's
                    translation is extracted from
            gene: the CDSFeature the domains were found in
            domains: a list of HMMResults found in the CDSFeature

        Returns:
            a dictionary mapping each HMMResult to the AntismashDomain
            created for it
    """
    # hit ids that are given a label, mapped to the short form used in it
    short_forms = {
        "AMP-binding": "A",
        "PKS_AT": "AT",
        "CAL_domain": "CAL",
        "PKS_KR": "KR",
    }
    # one counter per labelled type, plus a single counter for the rest
    labelled_counts = {hit_id: 0 for hit_id in short_forms}
    other_count = 0

    new_features = {}
    for domain in domains:
        location = gene.get_sub_location_from_protein_coordinates(
            domain.query_start, domain.query_end)

        # fill in everything that doesn't depend on the domain type
        new_feature = AntismashDomain(location)
        new_feature.domain = domain.hit_id
        new_feature.locus_tag = gene.locus_tag
        new_feature.detection = "hmmscan"
        new_feature.database = "nrpspksdomains.hmm"
        new_feature.evalue = domain.evalue
        new_feature.score = domain.bitscore

        # use translation table 1 if the gene doesn't supply one
        transl_table = gene.transl_table or 1
        nucleotides = new_feature.extract(record.seq)
        new_feature.translation = str(nucleotides.translate(table=transl_table))

        short_form = short_forms.get(domain.hit_id)
        if short_form is not None:
            labelled_counts[domain.hit_id] += 1
            domainname = "{}_{}{}".format(gene.get_name(), short_form,
                                          labelled_counts[domain.hit_id])
            new_feature.label = domainname
            new_feature.domain_id = "nrpspksdomains_" + domainname
        else:
            other_count += 1
            # only the part of the gene name before the first '.' is used
            new_feature.domain_id = "nrpspksdomains_{}_Xdom{:02d}".format(
                gene.get_name().partition(".")[0], other_count)

        # NOTE(review): new_features is keyed by HMMResult, while this checks
        # the feature's name, so it may never be able to fail - confirm
        # whether a check for unique names was intended
        assert new_feature.get_name() not in new_features
        new_features[domain] = new_feature
    return new_features
예제 #15
0
 def test_simple_location_forward_complete(self):
     # the full protein range of a single-part forward CDS must cover the
     # whole sequence and translate to the whole protein
     cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
     sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
     nucleotides = sub_location.extract(self.magic)
     assert nucleotides == self.magic
     assert nucleotides.translate() == self.translation
예제 #16
0
 def reverse_strand(self):
     """ Moves the fixtures to the reverse strand: both sequences are
         reverse complemented and the locations and CDS are rebuilt with
         the same coordinates but a strand of -1
     """
     self.magic = self.magic.reverse_complement()
     self.magic_split = self.magic_split.reverse_complement()
     flipped = []
     for part in self.sub_locations:
         flipped.append(FeatureLocation(part.start, part.end, strand=-1))
     self.sub_locations = flipped
     self.location = CompoundLocation(flipped)
     self.cds = CDSFeature(self.location, locus_tag="compound")
예제 #17
0
 def create_cds(self, start, end, strand=1):
     """ Builds a CDSFeature over the given coordinates, using
         "<start>-<end>" as its locus tag
     """
     location = FeatureLocation(start, end, strand)
     name = "%d-%d" % (start, end)
     return CDSFeature(location, locus_tag=name)
예제 #18
0
class TestCDSProteinLocation(unittest.TestCase):
    """ Tests the conversion of protein coordinates within a CDS into
        nucleotide locations, for simple and compound locations on both strands
    """

    def setUp(self):
        """ Builds the fixtures shared by the tests: a coding sequence for the
            protein "MAGIC", the same sequence split into three parts by 'x'
            filler, and a forward-strand compound CDS covering those parts
        """
        self.translation = "MAGIC"
        self.magic = Seq("ATGGCAGGTATTTGT")
        self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")
        # (start, end) of each part of the split sequence that isn't filler
        part_bounds = [(0, 6), (12, 15), (21, 27)]
        self.sub_locations = [FeatureLocation(start, end, strand=1)
                              for start, end in part_bounds]
        self.location = CompoundLocation(self.sub_locations)
        self.cds = CDSFeature(self.location, locus_tag="compound")

    def reverse_strand(self):
        """ Moves the fixtures to the reverse strand: both sequences are
            reverse complemented and the locations and CDS are rebuilt with
            the same coordinates but a strand of -1
        """
        self.magic = self.magic.reverse_complement()
        self.magic_split = self.magic_split.reverse_complement()
        flipped = []
        for part in self.sub_locations:
            flipped.append(FeatureLocation(part.start, part.end, strand=-1))
        self.sub_locations = flipped
        self.location = CompoundLocation(flipped)
        self.cds = CDSFeature(self.location, locus_tag="compound")

    def test_simple_location_forward_complete(self):
        # the full protein range of a single-part forward CDS must cover the
        # whole sequence and translate to the whole protein
        cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
        sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
        nucleotides = sub_location.extract(self.magic)
        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation

    def test_simple_location_forward_partial(self):
        # partial protein ranges of a single-part forward CDS must map onto
        # the matching slice of nucleotides and of the translation
        cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
        protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
        for first, last in protein_ranges:
            print("testing", first, last)
            sub_location = cds.get_sub_location_from_protein_coordinates(first, last)
            print(sub_location)
            nucleotides = sub_location.extract(self.magic)
            # the test expects three bases per amino acid
            expected_nucleotides = self.magic[first * 3: last * 3]
            assert nucleotides == expected_nucleotides
            assert nucleotides.translate() == self.translation[first:last]

    def test_compound_location_forward_full(self):
        # the full protein range of the compound CDS must give back a
        # location equal to the one the CDS was built from
        result = self.cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(result, CompoundLocation)
        assert len(result.parts) == 3
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in result.parts])
        assert len(result) == len(self.cds.location)
        assert result == self.location, "%s != %s" % (str(result), str(self.location))
        nucleotides = result.extract(self.magic_split)
        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation[0:5]

    def test_compound_forward_within_single(self):
        # protein ranges that sit within one part of the compound CDS must
        # come back as a simple location
        # (protein start, protein end, expected start, expected end, expected length)
        cases = [(0, 2, 0, 6, 6),
                 (2, 3, 12, 15, 3)]
        for prot_start, prot_end, nuc_start, nuc_end, size in cases:
            result = self.cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(result, FeatureLocation)
            assert len(result) == size
            assert result.start == nuc_start
            assert result.end == nuc_end
            translated = result.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]

    def test_compound_forward_over_multiple(self):
        # a protein range crossing a part boundary must come back as a
        # compound location with one part on each side of the boundary
        # (start, end) expected of each returned part, in order
        expected_bounds = [(12, 15), (21, 24)]
        result = self.cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(result, CompoundLocation)
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in result.parts])
        assert len(result.parts) == 2
        assert len(result) == 6
        for part, (start, end) in zip(result.parts, expected_bounds):
            assert part.start == start
            assert part.end == end
        translated = result.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]

    def test_compound_location_reverse_full(self):
        # the full protein range of a reverse-strand compound CDS must give
        # back all three parts, whichever order the parts were supplied in
        self.reverse_strand()
        for step in (1, -1):
            print("sub locations sorted in direction:", step)
            ordered_parts = self.sub_locations[::step]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
            result = cds.get_sub_location_from_protein_coordinates(0, 5)
            assert isinstance(result, CompoundLocation)
            assert len(result.parts) == 3
            print([str(part) for part in cds.location.parts])
            print([str(part) for part in result.parts])
            assert len(result) == len(cds.location)
            translated = result.extract(self.magic_split).translate()
            assert translated == self.translation[0:5]

    def test_compound_location_reverse_single(self):
        # protein ranges that sit within one part of a reverse-strand CDS
        # must come back as a simple location, for either supplied part order
        self.reverse_strand()
        # (protein start, protein end, expected start, expected end, expected length)
        cases = [(0, 2, 21, 27, 6),
                 (2, 3, 12, 15, 3)]
        for step in (1, -1):
            print("sub locations sorted in direction:", step)
            ordered_parts = self.sub_locations[::step]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")

            for prot_start, prot_end, nuc_start, nuc_end, size in cases:
                result = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
                assert isinstance(result, FeatureLocation)
                assert len(result) == size
                assert result.start == nuc_start
                assert result.end == nuc_end
                translated = result.extract(self.magic_split).translate()
                assert translated == self.translation[prot_start:prot_end]

    def test_compound_location_reverse_multiple(self):
        # a protein range crossing a part boundary of a reverse-strand CDS
        # must come back as two parts, regardless of the supplied part order
        self.reverse_strand()
        # (start, end) expected of each returned part, in order
        expected_bounds = [(12, 15), (3, 6)]
        for step in (1, -1):
            print("sub locations sorted in direction:", step)
            ordered_parts = self.sub_locations[::step]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")

            result = cds.get_sub_location_from_protein_coordinates(2, 4)
            assert isinstance(result, CompoundLocation)
            print([str(part) for part in cds.location.parts])
            print([str(part) for part in result.parts])
            assert len(result.parts) == 2
            assert len(result) == 6
            for part, (start, end) in zip(result.parts, expected_bounds):
                assert part.start == start
                assert part.end == end
            translated = result.extract(self.magic_split).translate()
            assert translated == self.translation[2:4]

    def test_complicated(self):
        # a ten-part forward-strand CDS, checked for protein coordinates 353-412
        part_bounds = [(121124, 122061), (122339, 122383),
                       (122559, 122666), (122712, 122874),
                       (123060, 123337), (123481, 123749),
                       (123809, 124032), (124091, 124193),
                       (124236, 124401), (124684, 124724)]
        exons = [FeatureLocation(start, end, 1) for start, end in part_bounds]
        location = CompoundLocation(exons, operator="join")
        cds = CDSFeature(location, locus_tag="complicated")
        nucleotides = (
            "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
            "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
            "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
            "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
            "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
            "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
            "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
            "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
            "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
            "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
            "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
            "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
            "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
            "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
            "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
            "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
            "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
            "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
            "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
            "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
            "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
            "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
            "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
            "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
            "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
            "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
            "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
            "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
            "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
            "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
            "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
            "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
            "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
            "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
            "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
            "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
            "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
            "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
            "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
            "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
            "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
            "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
            "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
            "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
            "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
            "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
            "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
            "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
            "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
            "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
        )
        protein = (
            "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
            "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
            "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
            "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
            "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
            "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
            "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
            "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
            "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
            "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
            "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
            "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
            "HTITLR"
        )
        prot_start, prot_end = 353, 412
        sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
        # pad the front of the sequence with filler so that its coordinates
        # line up with those of the location
        padded = Seq("x" * location.start + nucleotides)
        assert sub_location.extract(padded).translate() == protein[prot_start:prot_end]