def test_annot_in0(self):
        """
        Run func_annot on an In0 integron (integrase only, nothing to annotate).

        A stale, empty ``<replicon id>_subseqprot.tmp`` file is planted in the
        temporary directory beforehand (as if a previous run had not cleaned
        up). After the annotation only the four integrase search result files
        may remain in that directory, and the protein table of the integron
        must still be the empty, typed table it was before.
        """
        # Leftover of a hypothetical previous run: func_annot has to remove it.
        stale_tmp = os.path.join(self.tmp_dir, "{}_subseqprot.tmp".format(self.replicon.id))
        with open(stale_tmp, "w"):
            pass

        # An integron holding nothing but its integrase.
        in0 = Integron(self.replicon, self.cfg)
        in0.add_integrase(55, 1014, "ACBA.007.P01_13_1", 1, 1.9e-25, "intersection_tyr_intI")

        # Expected protein table: no row, columns in this order with these dtypes.
        schema = [("pos_beg", "int"), ("pos_end", "int"), ("strand", "int"),
                  ("evalue", "float"), ("type_elt", "str"), ("model", "str"),
                  ("distance_2attC", "float"), ("annotation", "str")]
        no_proteins = pd.DataFrame(columns=[col for col, _ in schema]).astype(dtype=dict(schema))
        pdt.assert_frame_equal(no_proteins, in0.proteins)

        # Annotate proteins
        func_annot([in0], self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir)

        # Only regular files count; sub-directories of tmp_dir are ignored.
        found = set(filter(os.path.isfile, glob.glob(os.path.join(self.tmp_dir, "*"))))
        wanted = {
            os.path.join(self.tmp_dir, "{}{}".format(self.replicon.id, suffix))
            for suffix in ("_intI_table.res", "_phage_int_table.res", "_intI.res", "_phage_int.res")
        }
        self.assertEqual(wanted, found)

        # The annotation must not have added any protein.
        pdt.assert_frame_equal(no_proteins, in0.proteins)
    def test_has_integrase(self):
        """has_integrase() is False for a new Integron and True once an integrase was added."""
        # Freshly created integron: no integrase yet.
        bare = Integron(SeqRecord(Seq.Seq(''), id='foo', name='bar'), self.cfg)
        self.assertFalse(bare.has_integrase())

        # Same check after registering one integrase.
        record = SeqRecord(Seq.Seq(''), id='just_one_integrase', name='bar')
        with_integrase = Integron(record, self.cfg)
        with_integrase.add_integrase(10, 100, 'foo', 1, 1e-2, "intersection_tyr_intI")
        self.assertTrue(with_integrase.has_integrase())
Exemple #3
0
    def test_add_integrase(self):
        """
        Check add_integrase on a replicon read from the test data.

        The first call must fill ``integron.integrase`` with a single row
        indexed by the integrase id; a second call on the same integron must
        raise RuntimeError with the message "add_integrase should be called once.".
        """
        # Load the first sequence of the replicon file, declared as linear.
        fasta_path = self.find_data(os.path.join('Replicons', 'acba.007.p01.13.fst'))
        linear = Topology('lin')
        with FastaIterator(fasta_path) as records:
            records.topologies = linear
            replicon = next(records)

        integrase_id = "ACBA.007.P01_13_1"
        begin, end, strand = 55, 1014, 1
        evalue = 1.900000e-25
        model = "intersection_tyr_intI"
        # Positional arguments shared by both add_integrase calls below.
        call_args = (begin, end, integrase_id, strand, evalue, model)

        integron = Integron(replicon, self.cfg)
        integron.add_integrase(*call_args)

        # Expected one-row table, laid out and typed like the fixture columns.
        expected = pd.DataFrame(
            {
                "pos_beg": begin,
                "pos_end": end,
                "strand": strand,
                "evalue": evalue,
                "type_elt": "protein",
                "annotation": "intI",
                "model": model,
                "distance_2attC": np.nan,
            },
            columns=self.columns,
            index=[integrase_id],
        ).astype(dtype=self.dtype)
        pdt.assert_frame_equal(expected, integron.integrase)

        # An integron accepts only one integrase.
        with self.assertRaises(RuntimeError) as ctx:
            integron.add_integrase(*call_args)
        self.assertEqual(str(ctx.exception), "add_integrase should be called once.")
Exemple #4
0
    def test_type(self):
        """
        Check Integron.type() for the four possible contents:
        nothing -> None, integrase only -> "In0", attC only -> "CALIN",
        integrase and attC -> "complete".
        """
        def new_integron(replicon_id):
            # Integron on an empty sequence whose record id is replicon_id.
            return Integron(SeqRecord(Seq.Seq(''), id=replicon_id), self.cfg)

        integrase_args = (10, 100, 'foo', 1, 1e-2, "intersection_tyr_intI")
        attc_args = (10, 100, 1, 1e-2, "intersection_tyr_intI")

        empty = new_integron('foo')
        self.assertIsNone(empty.type())

        integrase_only = new_integron('just_one_integrase')
        integrase_only.add_integrase(*integrase_args)
        self.assertEqual(integrase_only.type(), "In0")

        attc_only = new_integron('just_one_attC')
        attc_only.add_attC(*attc_args)
        self.assertEqual(attc_only.type(), "CALIN")

        both = new_integron('one_integrase_one_attC')
        both.add_integrase(*integrase_args)
        both.add_attC(*attc_args)
        self.assertEqual(both.type(), "complete")
    def test_annot_multi(self):
        """
        Test func_annot when there are 4 integrons:
        - 1 calin with 4 proteins, 2 having a resfam annotation
        - 1 calin with 2 proteins, none having a resfam annotation
        - 1 in0
        - 1 complete with 4 proteins, 3 having a resfam annotation

        func_annot is run three times on the same integrons: with an evalue
        threshold of 1e-32, with the default thresholds, and with a coverage
        threshold of 0.4. After each run the protein tables are compared with
        the expected ones, each later run bringing one more annotation.
        """
        # resfam for: 16, 13, 3, 12

        # Column order shared by every expected protein table.
        prot_columns = ["pos_beg", "pos_end", "strand", "evalue", "type_elt",
                        "model", "distance_2attC", "annotation"]

        def typed(frame):
            # Put the columns in the canonical order and cast them to the fixture dtypes.
            return frame[prot_columns].astype(dtype=self.prot_dtype)

        # Create integron in0
        integron1 = Integron(self.replicon, self.cfg)
        integron1.add_integrase(56, 1014, "ACBA.007.P01_13_1", 1, 1.9e-25, "intersection_tyr_intI")

        # Create integron CALIN with resfam proteins
        integron2 = Integron(self.replicon, self.cfg)
        integron2.add_attC(7400, 7650, -1, 7e-9, "attc_4")
        integron2.add_attC(8600, 8650, -1, 7e-4, "attc_4")
        integron2.add_attC(10200, 10400, -1, 7e-7, "attc_4")
        integron2.add_attC(10800, 10900, -1, 7e-7, "attc_4")
        integron2.add_proteins(self.prot_db)

        # Create integron CALIN without any resfam proteins
        integron3 = Integron(self.replicon, self.cfg)
        integron3.add_attC(4320, 4400, -1, 7e-9, "attc_4")
        integron3.add_proteins(self.prot_db)

        # Create complete integron
        integron4 = Integron(self.replicon, self.cfg)
        integron4.add_attC(17825, 17884, -1, 7e-9, "attc_4")
        integron4.add_attC(19080, 19149, -1, 7e-4, "attc_4")
        integron4.add_attC(19618, 19726, -1, 7e-7, "attc_4")
        integron4.add_integrase(16542, 17381, "ACBA.007.P01_13_19", -1, 1.9e-25, "intersection_tyr_intI")
        integron4.add_proteins(self.prot_db)

        integrons = [integron1, integron2, integron3, integron4]

        # Create dataframes for expected proteins before annotation
        # (the in0 has no protein: empty table, typed columns only)
        proteins1 = pd.DataFrame(columns=prot_columns)
        proteins1 = proteins1.astype(dtype={"pos_beg": "int", "pos_end": "int", "strand": "int",
                                            "evalue": "float", "type_elt": "str", "model": "str",
                                            "distance_2attC": "float", "annotation": "str"})
        proteins1 = typed(proteins1)

        proteins2 = typed(pd.DataFrame({"pos_beg": [7088, 7710, 8650, 10524],
                                        "pos_end": [7351, 8594, 10125, 11699],
                                        "strand": [1, -1, -1, -1],
                                        "evalue": [np.nan] * 4,
                                        "type_elt": ["protein"] * 4,
                                        "model": ["NA"] * 4,
                                        "distance_2attC": [np.nan] * 4,
                                        "annotation": ["protein"] * 4},
                                       index=["ACBA.007.P01_13_11", "ACBA.007.P01_13_12",
                                              "ACBA.007.P01_13_13", "ACBA.007.P01_13_14"]))

        proteins3 = typed(pd.DataFrame({"pos_beg": [3546, 4380],
                                        "pos_end": [4313, 4721],
                                        "strand": [1, 1],
                                        "evalue": [np.nan] * 2,
                                        "type_elt": ["protein"] * 2,
                                        "model": ["NA"] * 2,
                                        "distance_2attC": [np.nan] * 2,
                                        "annotation": ["protein"] * 2},
                                       index=["ACBA.007.P01_13_6", "ACBA.007.P01_13_7"]))

        proteins4 = typed(pd.DataFrame({"pos_beg": [17375, 17886, 19090, 19721],
                                        "pos_end": [17722, 18665, 19749, 20254],
                                        "strand": [-1] * 4,
                                        "evalue": [np.nan] * 4,
                                        "type_elt": ["protein"] * 4,
                                        "model": ["NA"] * 4,
                                        "distance_2attC": [np.nan] * 4,
                                        "annotation": ["protein"] * 4},
                                       index=["ACBA.007.P01_13_20", "ACBA.007.P01_13_21",
                                              "ACBA.007.P01_13_22", "ACBA.007.P01_13_23"]))

        # The list holds the frames themselves, so the in-place row updates
        # made further down are seen by every later comparison.
        expected_proteins = [proteins1, proteins2, proteins3, proteins4]

        def assert_proteins_as_expected():
            # Both frames are sorted by index before comparison: the protein
            # file is parsed through a Biopython index, so the order of the
            # sequences is not guaranteed.
            for inte, exp_prot in zip(integrons, expected_proteins):
                pdt.assert_frame_equal(inte.proteins.sort_index(), exp_prot.sort_index())

        # Check proteins before annotation
        assert_proteins_as_expected()

        # Annotate proteins with evalue threshold
        func_annot(integrons, self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir, evalue=1e-32)

        # Check that all files generated are as expected
        files_created = [f for f in glob.glob(os.path.join(self.tmp_dir, "*")) if os.path.isfile(f)]
        self.assertEqual(set(self.exp_files), set(files_created))

        # Check that annotated proteins are as expected
        proteins2.loc["ACBA.007.P01_13_13"] = [8650, 10125, -1, 2.4e-86, "protein",
                                               "RF0007", np.nan, "ABC_efflux"]
        proteins4.loc["ACBA.007.P01_13_21"] = [17886, 18665, -1, 7.4e-168, "protein",
                                               "RF0027", np.nan, "ANT3"]
        proteins4.loc["ACBA.007.P01_13_23"] = [19721, 20254, -1, 6.2e-110, "protein",
                                               "RF0003", np.nan, "AAC3-I"]
        assert_proteins_as_expected()

        # Annotate proteins with default evalue (1 more annotation)
        with self.catch_io(out=True):
            func_annot(integrons, self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir)
        proteins4.loc["ACBA.007.P01_13_20"] = [17375, 17722, -1, 4.5e-31, "protein",
                                               "RF0066", np.nan, "emrE"]
        assert_proteins_as_expected()

        # Annotate proteins with lower coverage threshold (1 more annotation)
        with self.catch_io(out=True):
            func_annot(integrons, self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir, coverage=0.4)

        proteins2.loc["ACBA.007.P01_13_12"] = [7710, 8594, -1, 1.6e-5, "protein",
                                               "RF0033", np.nan, "APH3"]
        assert_proteins_as_expected()