def test_annot_in0(self):
        """
        Run func_annot on an In0 integron, i.e. one holding an integrase and nothing else.

        There is no protein to annotate, but a ``_subseqprot.tmp`` file (as a previous
        run could have left behind) already sits in the output directory.
        """
        # Plant an empty _subseqprot.tmp file; func_annot is expected to remove it
        leftover = os.path.join(self.tmp_dir, "{}_subseqprot.tmp".format(self.replicon.id))
        with open(leftover, "w"):
            pass

        # Integron carrying only an integrase
        in0 = Integron(self.replicon, self.cfg)
        in0.add_integrase(55, 1014, "ACBA.007.P01_13_1", 1, 1.9e-25, "intersection_tyr_intI")

        # The proteins table must be empty (but typed) before annotation ...
        col_dtypes = (("pos_beg", "int"), ("pos_end", "int"), ("strand", "int"),
                      ("evalue", "float"), ("type_elt", "str"), ("model", "str"),
                      ("distance_2attC", "float"), ("annotation", "str"))
        no_protein = pd.DataFrame(columns=[col for col, _ in col_dtypes])
        no_protein = no_protein.astype(dtype=dict(col_dtypes))
        pdt.assert_frame_equal(no_protein, in0.proteins)

        func_annot([in0], self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir)

        # Only these four result files may be present in the output directory
        suffixes = ("_intI_table.res", "_phage_int_table.res", "_intI.res", "_phage_int.res")
        expected_files = {os.path.join(self.tmp_dir, "{}{}".format(self.replicon.id, suffix))
                          for suffix in suffixes}
        found_files = {path for path in glob.glob(os.path.join(self.tmp_dir, "*"))
                       if os.path.isfile(path)}
        self.assertEqual(expected_files, found_files)

        # ... and still empty afterwards
        pdt.assert_frame_equal(no_protein, in0.proteins)
    def test_find_attc_max_In0(self):
        """
        Call find_attc_max on an integron for which only the integrase is set and
        check that the returned table is empty, with the expected columns and dtypes.
        """
        name = 'ESCO001.B.00018.P002'
        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))

        circular_topology = Topology('circ')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = circular_topology
            replicon = next(seq_db)

        in0 = Integron(replicon, self.cfg)

        integrase_fields = {
            'pos_beg': [90229],
            'pos_end': [91242],
            'strand': -1,
            'evalue': 1.400000e-24,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan,
        }
        in0.integrase = pd.DataFrame(integrase_fields,
                                     index=['ESCO001.B.00018.P002_106'],
                                     columns=self.columns).astype(dtype=self.dtype)

        cfg = self.cfg
        found = find_attc_max([in0], replicon,
                              cfg.distance_threshold, cfg.model_attc_path,
                              cfg.max_attc_size, cfg.min_attc_size,
                              circular=True,
                              out_dir=self.tmp_dir)

        # Expected result: no row at all, only typed columns
        nothing = pd.DataFrame(columns=self.max_cols).astype(dtype=self.max_dtype)
        pdt.assert_frame_equal(found, nothing)
    def test_annot_calin_empty(self):
        """
        Run func_annot on a CALIN (attC sites, no integrase) that holds no protein:
        there is nothing to annotate.
        """
        # CALIN: three attC sites and no integrase
        calin = Integron(self.replicon, self.cfg)
        for beg, end, evalue in ((17825, 17884, 7e-9),
                                 (19080, 19149, 7e-4),
                                 (19618, 19726, 7e-7)):
            calin.add_attC(beg, end, -1, evalue, "attc_4")

        # Empty, typed proteins table expected both before and after annotation
        col_names = ["pos_beg", "pos_end", "strand", "evalue",
                     "type_elt", "model", "distance_2attC", "annotation"]
        col_dtypes = {"pos_beg": "int", "pos_end": "int", "strand": "int",
                      "evalue": "float", "type_elt": "str", "model": "str",
                      "distance_2attC": "float", "annotation": "str"}
        no_protein = pd.DataFrame(columns=col_names).astype(dtype=col_dtypes)
        pdt.assert_frame_equal(no_protein, calin.proteins)

        func_annot([calin], self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir)

        # Only these four result files may be present in the output directory
        suffixes = ("_intI_table.res", "_phage_int_table.res", "_intI.res", "_phage_int.res")
        expected_files = {os.path.join(self.tmp_dir, "{}{}".format(self.replicon.id, suffix))
                          for suffix in suffixes}
        found_files = {path for path in glob.glob(os.path.join(self.tmp_dir, "*"))
                       if os.path.isfile(path)}
        self.assertEqual(expected_files, found_files)

        pdt.assert_frame_equal(no_protein, calin.proteins)
    def test_annot_calin(self):
        """
        Run func_annot on a CALIN (attC sites, no integrase) holding 4 proteins:
        a resfam annotation is expected for 3 of them, none for the remaining one.
        """
        # CALIN: three attC sites and no integrase
        calin = Integron(self.replicon, self.cfg)
        for beg, end, evalue in ((17825, 17884, 7e-9),
                                 (19080, 19149, 7e-4),
                                 (19618, 19726, 7e-7)):
            calin.add_attC(beg, end, -1, evalue, "attc_4")
        # Add the proteins located between the attC sites
        calin.add_proteins(self.prot_db)

        # Expected proteins table before annotation
        col_order = ["pos_beg", "pos_end", "strand", "evalue", "type_elt",
                     "model", "distance_2attC", "annotation"]
        prot_ids = ["ACBA.007.P01_13_20", "ACBA.007.P01_13_21",
                    "ACBA.007.P01_13_22", "ACBA.007.P01_13_23"]
        proteins = pd.DataFrame({"pos_beg": [17375, 17886, 19090, 19721],
                                 "pos_end": [17722, 18665, 19749, 20254],
                                 "strand": [-1] * 4,
                                 "evalue": [np.nan] * 4,
                                 "type_elt": ["protein"] * 4,
                                 "model": ["NA"] * 4,
                                 "distance_2attC": [np.nan] * 4,
                                 "annotation": ["protein"] * 4},
                                index=prot_ids)
        proteins = proteins[col_order]
        # Both frames are sorted on their index before being compared: the protein
        # file is parsed with biopython and indexed, so the order of the sequences
        # is not guaranteed.
        pdt.assert_frame_equal(proteins.sort_index(), calin.proteins.sort_index())

        func_annot([calin], self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir)

        # The output directory must hold exactly the expected files
        found_files = {path for path in glob.glob(os.path.join(self.tmp_dir, "*"))
                       if os.path.isfile(path)}
        self.assertEqual(set(self.exp_files), found_files)

        # Rows expected to change with annotation (ACBA.007.P01_13_22 stays as it was)
        annotated_rows = (
            ("ACBA.007.P01_13_20", [17375, 17722, -1, 4.5e-31, "protein", "RF0066", np.nan, "emrE"]),
            ("ACBA.007.P01_13_21", [17886, 18665, -1, 7.4e-168, "protein", "RF0027", np.nan, "ANT3"]),
            ("ACBA.007.P01_13_23", [19721, 20254, -1, 6.2e-110, "protein", "RF0003", np.nan, "AAC3-I"]),
        )
        for prot_id, row in annotated_rows:
            proteins.loc[prot_id] = row
        # Same index sort as above, for the same reason
        pdt.assert_frame_equal(proteins.sort_index(), calin.proteins.sort_index())
 def test_annot_wrong_hmmsearch(self):
     """
     Test that when the given HMMSEARCH command does not exist, func_annot raises a
     RuntimeError whose message reports that the command could not be run.
     """
     self.cfg._args.hmmsearch = "nimportnaoik"
     # Create integron
     # NOTE(review): the sibling tests build the Integron from self.replicon (the
     # object), not from its name -- confirm that passing the name is intended here.
     integron1 = Integron(self.replicon.name, self.cfg)
     integrons = [integron1]
     # Add only attc sites (no integrase)
     integron1.add_attC(17825, 17884, -1, 7e-9, "attc_4")
     integron1.add_attC(19080, 19149, -1, 7e-4, "attc_4")
     integron1.add_attC(19618, 19726, -1, 7e-7, "attc_4")
     # Add proteins between attC sites
     integron1.add_proteins(self.prot_db)
     # Annotate proteins
     with self.assertRaises(RuntimeError) as ctx:
         func_annot(integrons, self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir)
     # Raw string: "\[" in a plain string literal is an invalid escape sequence.
     # assertRegex performs the same re.search but reports both the pattern and
     # the message on failure.
     self.assertRegex(str(ctx.exception),
                      r"failed : \[Errno 2\] No such file or directory: 'nimportnaoik'")
 def test_annot_wrong_hmm(self):
     """
     Check that, when the hmm file handed to func_annot does not exist, a RuntimeError
     is raised whose message ends with " failed return code = 1".
     """
     missing_hmm = ["myhmm.hmm"]
     # CALIN: three attC sites and no integrase
     calin = Integron(self.replicon, self.cfg)
     for beg, end, evalue in ((17825, 17884, 7e-9),
                              (19080, 19149, 7e-4),
                              (19618, 19726, 7e-7)):
         calin.add_attC(beg, end, -1, evalue, "attc_4")
     # Add the proteins located between the attC sites
     calin.add_proteins(self.prot_db)
     # The annotation itself is what must fail
     with self.assertRaises(RuntimeError) as ctx:
         func_annot([calin], self.replicon, self.prot_db, missing_hmm, self.cfg, self.tmp_dir)
     message = str(ctx.exception)
     self.assertTrue(message.endswith(" failed return code = 1"))
# Example #7
# 0
    def test_add_integrase(self):
        """
        Check that add_integrase fills ``integron.integrase`` with the expected
        one-row table, and that calling it a second time raises a RuntimeError.
        """
        name = "acba.007.p01.13"
        fasta_path = self.find_data(os.path.join('Replicons', name + '.fst'))
        linear_topology = Topology('lin')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = linear_topology
            replicon = next(seq_db)

        integrase_id = "ACBA.007.P01_13_1"
        fields = {"pos_beg": 55,
                  "pos_end": 1014,
                  "strand": 1,
                  "evalue": 1.900000e-25,
                  "type_elt": "protein",
                  "annotation": "intI",
                  "model": "intersection_tyr_intI",
                  "distance_2attC": np.nan}

        expected = pd.DataFrame(fields, columns=self.columns, index=[integrase_id])
        expected = expected.astype(dtype=self.dtype)

        # Positional arguments shared by the two add_integrase calls below
        call_args = (fields["pos_beg"], fields["pos_end"], integrase_id,
                     fields["strand"], fields["evalue"], fields["model"])

        integron = Integron(replicon, self.cfg)
        integron.add_integrase(*call_args)
        pdt.assert_frame_equal(expected, integron.integrase)

        # A second call on the same integron must be rejected
        with self.assertRaises(RuntimeError) as ctx:
            integron.add_integrase(*call_args)
        self.assertEqual(str(ctx.exception), "add_integrase should be called once.")
    def test_integrons_report(self):
        """
        Build an integron by hand (1 integrase, 3 attC sites, 1 promoter, 4 proteins),
        pass it to results.integrons_report and compare the report with the reference
        ``.integrons`` file found in the test data.
        """
        replicon_name = "acba.007.p01.13"
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        args = argparse.Namespace()
        cfg = Config(args)
        # (the original set eagle_eyes twice in a row; once is enough)
        cfg._args.eagle_eyes = False
        cfg._args.local_max = False

        integron = Integron(replicon, cfg)
        columns = ['pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model', 'distance_2attC', 'annotation']
        dtype = {"pos_beg": 'int',
                 "pos_end": 'int',
                 "strand": 'int',
                 "evalue": 'float',
                 "type_elt": 'str',
                 "annotation": 'str',
                 "model": 'str',
                 "distance_2attC": 'float'}

        def typed_frame(data, index):
            """Return *data* as a frame with the columns and dtypes defined above."""
            return pd.DataFrame(data, columns=columns, index=index).astype(dtype=dtype)

        data_integrase = {"pos_beg": 55,
                          "pos_end": 1014,
                          "strand": 1,
                          "evalue": 1.900000e-25,
                          "type_elt": "protein",
                          "annotation": "intI",
                          "model": "intersection_tyr_intI",
                          "distance_2attC": np.nan}
        integrase = typed_frame(data_integrase, ["ACBA.007.P01_13_1"])

        data_attc = {"pos_beg": [17825, 19080, 19618],
                     "pos_end": [17884, 19149, 19726],
                     "strand": [-1] * 3,
                     "evalue": [1.000000e-09, 1.000000e-04, 1.100000e-07],
                     "type_elt": ["attC"] * 3,
                     "annotation": ["attC"] * 3,
                     "model": ["attc_4"] * 3,
                     "distance_2attC": [np.nan, 1196.0, 469.0]}
        attC = typed_frame(data_attc, ['attc_00{}'.format(i) for i in range(1, 4)])

        data_promoter = {'pos_beg': 25,
                         'pos_end': 51,
                         'strand': -1,
                         'evalue': np.nan,
                         'type_elt': 'Promoter',
                         'annotation': 'Pc_1',
                         'model': np.nan,
                         'distance_2attC': np.nan}
        promoter = typed_frame(data_promoter, ['Pc_int1'])

        data_proteins = {'pos_beg': [17375, 17886, 19090, 19721],
                         'pos_end': [17722, 18665, 19749, 20254],
                         'strand': [-1] * 4,
                         'evalue': [np.nan] * 4,
                         'type_elt': ['protein'] * 4,
                         'annotation': ['protein'] * 4,
                         'model': [np.nan] * 4,
                         'distance_2attC': [np.nan] * 4}
        proteins = typed_frame(data_proteins,
                               ['ACBA.007.P01_13_2{}'.format(i) for i in range(0, 4)])

        # Set the tables directly on the integron instead of going through add_* methods
        integron.integrase = integrase
        integron.attC = attC
        integron.promoter = promoter
        integron.proteins = proteins
        report = results.integrons_report([integron])

        # Reference report: tab separated file, cast to the same dtypes
        exp_report_path = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         '{}.integrons'.format(replicon_name)))
        exp_report = pd.read_csv(exp_report_path, sep="\t")
        exp_report = exp_report.astype(dtype=dtype)
        pdt.assert_frame_equal(exp_report, report)
    def setUp(self):
        """
        Prepare the fixtures shared by the tests: integron home, a freshly created
        temporary directory, the configuration, the OBAL001.B.00005.C001 replicon,
        an Integron built on it, and the column names / dtypes of the expected tables.
        """
        self.local_install = 'INTEGRON_HOME' in os.environ
        if self.local_install:
            self.integron_home = os.environ['INTEGRON_HOME']
        else:
            # Fall back on the directory two levels above the one holding this file
            here = os.path.dirname(__file__)
            self.integron_home = os.path.normpath(
                os.path.abspath(os.path.join(here, '..', '..')))

        # Always start from an empty temporary directory
        self.tmp_dir = os.path.join(tempfile.gettempdir(),
                                    'tmp_test_integron_finder')
        if os.path.isdir(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.makedirs(self.tmp_dir)

        args = argparse.Namespace(
            attc_model='attc_4.cm',
            max_attc_size=200,
            min_attc_size=40,
            distance_threshold=4000,  # (4kb at least between 2 different arrays)
            eagle_eyes=False,
            local_max=False,
        )
        self.cfg = Config(args)
        self.cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        fasta_path = self.find_data(
            os.path.join('Replicons', 'OBAL001.B.00005.C001' + '.fst'))
        linear_topology = Topology('lin')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = linear_topology
            self.replicon = next(seq_db)

        self.integron = Integron(self.replicon, self.cfg)

        # Layout of the per-element tables (integrase, attC, promoter, ...)
        self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue',
                        'type_elt', 'model', 'distance_2attC', 'annotation']
        self.dtype = {"pos_beg": 'int',
                      "pos_end": 'int',
                      "strand": 'int',
                      "evalue": 'float',
                      "type_elt": 'str',
                      "annotation": 'str',
                      "model": 'str',
                      "distance_2attC": 'float'}

        # Layout of the table compared with the find_attc_max result
        self.max_dtype = {'Accession_number': 'str',
                          'cm_attC': 'str',
                          'cm_debut': 'int',
                          'cm_fin': 'int',
                          'pos_beg': 'int',
                          'pos_end': 'int',
                          'sens': 'str',
                          'evalue': 'float'}
        self.max_cols = ['Accession_number', 'cm_attC', 'cm_debut', 'cm_fin',
                         'pos_beg', 'pos_end', 'sens', 'evalue']
    def test_integrons_report(self):
        """
        Check results.integrons_report on a hand-built integron.

        The integron gets 1 integrase, 3 attC sites, 1 promoter and 4 proteins; the
        resulting report is compared with the reference ``.integrons`` file found
        in the test data.
        """
        replicon_name = "acba.007.p01.13"
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        args = argparse.Namespace()
        cfg = Config(args)
        # (the original set eagle_eyes twice in a row; once is enough)
        cfg._args.eagle_eyes = False
        cfg._args.local_max = False

        integron = Integron(replicon, cfg)
        columns = [
            'pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model',
            'distance_2attC', 'annotation'
        ]
        dtype = {
            "pos_beg": 'int',
            "pos_end": 'int',
            "strand": 'int',
            "evalue": 'float',
            "type_elt": 'str',
            "annotation": 'str',
            "model": 'str',
            "distance_2attC": 'float'
        }

        # --- integrase -------------------------------------------------------
        data_integrase = {
            "pos_beg": 55,
            "pos_end": 1014,
            "strand": 1,
            "evalue": 1.900000e-25,
            "type_elt": "protein",
            "annotation": "intI",
            "model": "intersection_tyr_intI",
            "distance_2attC": np.nan
        }
        id_int = "ACBA.007.P01_13_1"
        integrase = pd.DataFrame(data_integrase,
                                 columns=columns,
                                 index=[id_int]).astype(dtype=dtype)

        # --- attC sites ------------------------------------------------------
        data_attc = {
            "pos_beg": [17825, 19080, 19618],
            "pos_end": [17884, 19149, 19726],
            "strand": [-1] * 3,
            "evalue": [1.000000e-09, 1.000000e-04, 1.100000e-07],
            "type_elt": ["attC"] * 3,
            "annotation": ["attC"] * 3,
            "model": ["attc_4"] * 3,
            "distance_2attC": [np.nan, 1196.0, 469.0]
        }
        attC = pd.DataFrame(
            data_attc,
            columns=columns,
            index=['attc_00{}'.format(i) for i in range(1, 4)]).astype(dtype=dtype)

        # --- promoter --------------------------------------------------------
        data_promoter = {
            'pos_beg': 25,
            'pos_end': 51,
            'strand': -1,
            'evalue': np.nan,
            'type_elt': 'Promoter',
            'annotation': 'Pc_1',
            'model': np.nan,
            'distance_2attC': np.nan
        }
        promoter = pd.DataFrame(data_promoter,
                                index=['Pc_int1'],
                                columns=columns).astype(dtype=dtype)

        # --- proteins --------------------------------------------------------
        data_proteins = {
            'pos_beg': [17375, 17886, 19090, 19721],
            'pos_end': [17722, 18665, 19749, 20254],
            'strand': [-1] * 4,
            'evalue': [np.nan] * 4,
            'type_elt': ['protein'] * 4,
            'annotation': ['protein'] * 4,
            'model': [np.nan] * 4,
            'distance_2attC': [np.nan] * 4
        }
        proteins = pd.DataFrame(
            data_proteins,
            index=['ACBA.007.P01_13_2{}'.format(i) for i in range(0, 4)],
            columns=columns).astype(dtype=dtype)

        # Set the tables directly on the integron instead of going through add_* methods
        integron.integrase = integrase
        integron.attC = attC
        integron.promoter = promoter
        integron.proteins = proteins
        report = results.integrons_report([integron])

        # Reference report: tab separated file, cast to the same dtypes
        exp_report_path = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         '{}.integrons'.format(replicon_name)))
        exp_report = pd.read_csv(exp_report_path, sep="\t")
        exp_report = exp_report.astype(dtype=dtype)
        pdt.assert_frame_equal(exp_report, report)
    def test_describe(self):
        """
        Check Integron.describe on an integron whose integrase, attC, promoter, attI
        and proteins tables each hold a single row.

        The expected description is the concatenation of the five tables, with the
        row labels moved to an ``element`` column and the ``type``, ``ID_replicon``,
        ``ID_integron``, ``default`` and ``considered_topology`` columns appended.
        """
        replicon_name = "acba.007.p01.13"
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        args = argparse.Namespace()
        args.eagle_eyes = False
        args.local_max = False
        cfg = Config(args)

        integron = Integron(replicon, cfg)

        def one_row(data, label):
            """Return *data* as a typed, single-row frame labelled *label*."""
            frame = pd.DataFrame(data, columns=self.columns, index=[label])
            return frame.astype(dtype=self.dtype)

        data_integrase = {
            "pos_beg": 55,
            "pos_end": 1014,
            "strand": 1,
            "evalue": 1.900000e-25,
            "type_elt": "protein",
            "annotation": "intI",
            "model": "intersection_tyr_intI",
            "distance_2attC": np.nan
        }
        id_int = "ACBA.007.P01_13_1"
        integrase = one_row(data_integrase, id_int)

        # The same values are reused for the four remaining tables; only the
        # row label differs from one table to the next.
        data_attc = {
            "pos_beg": 10,
            "pos_end": 100,
            "strand": -1,
            "evalue": 1.1e-07,
            "type_elt": "attC",
            "annotation": "attC",
            "model": "attc_4",
            "distance_2attC": np.nan
        }
        attC = one_row(data_attc, 'attc_001')
        promoter = one_row(data_attc, 'prom_001')
        attI = one_row(data_attc, 'attI_001')
        proteins = one_row(data_attc, 'prot_001')

        excp_description = pd.concat(
            [integrase, attC, promoter, attI, proteins], ignore_index=False)
        # Move the row labels into a first column named "element"
        excp_description = excp_description.reset_index()
        excp_description.columns = ["element"] + list(
            excp_description.columns[1:])
        excp_description["type"] = "complete"
        excp_description["ID_replicon"] = replicon.id
        excp_description["ID_integron"] = id(
            integron)  # uniq identifier of a given Integron
        excp_description["default"] = "Yes"
        excp_description["considered_topology"] = replicon.topology
        excp_description.drop_duplicates(subset=["element"], inplace=True)

        # (the original performed this assignment twice in a row; once is enough)
        self.cfg._args.eagle_eyes = False
        integron.integrase = integrase
        integron.attC = attC
        integron.promoter = promoter
        integron.attI = attI
        integron.proteins = proteins

        received_description = integron.describe()
        pdt.assert_frame_equal(received_description, excp_description)
    def test_add_proteins(self):
        """
        Check that add_proteins, on an integron holding 7 attC sites and fed with a
        ProdigalDB built from the ``.prt.short`` file, yields the 4 expected proteins.
        """
        name = 'pssu.001.c01.13'
        fasta_path = self.find_data(
            os.path.join('Replicons', name + '.fst'))
        linear_topology = Topology('lin')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = linear_topology
            replicon = next(seq_db)

        prot_file = os.path.join(self._data_dir,
                                 '{}.prt.short'.format(name))

        args = argparse.Namespace()
        args.gembase = False
        args.annot_parser_name = None
        cfg = Config(args)
        integron = Integron(replicon, cfg)

        attc_begs = [3072863, 3073496, 3074121, 3075059, 3075593, 3076281, 3076659]
        attc_ends = [3072931, 3073555, 3074232, 3075118, 3075652, 3076340, 3076718]
        nb_attc = len(attc_begs)
        attc_fields = {
            "pos_beg": attc_begs,
            "pos_end": attc_ends,
            "strand": [-1] * nb_attc,
            "evalue": [2.5e-06, 7e-08, 6.5e-08, 3.2e-06, 4.1e-07, 1.4e-08, 4e-08],
            "type_elt": ['attC'] * nb_attc,
            "annotation": ['attC'] * nb_attc,
            "model": ['attc_4'] * nb_attc,
            "distance_2attC": [np.nan, 565.0, 566.0, 827.0, 475.0, 629.0, 319.0],
        }
        # Row labels run from attc_000 to attc_006
        attc_labels = ['attc_00{}'.format(rank) for rank in range(nb_attc)]
        attc_sites = pd.DataFrame(attc_fields,
                                  columns=self.columns,
                                  index=attc_labels)
        integron.attC = attc_sites.astype(dtype=self.dtype)

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        integron.add_proteins(prot_db)

        prot_fields = {
            'pos_beg': [3071974, 3072950, 3074243, 3076720],
            'pos_end': [3072855, 3073468, 3075055, 3077511],
            'strand': [-1] * 4,
            'evalue': [np.nan] * 4,
            'type_elt': ['protein'] * 4,
            'annotation': ['protein'] * 4,
            'model': ['NA'] * 4,
            'distance_2attC': [np.nan] * 4,
        }
        prot_labels = ['PSSU.001.C01_13_281{}'.format(rank) for rank in range(5, 9)]
        expected = pd.DataFrame(prot_fields,
                                index=prot_labels,
                                columns=self.columns).astype(dtype=self.dtype)
        # Compare on sorted indexes so that the row order does not matter
        pdt.assert_frame_equal(expected.sort_index(),
                               integron.proteins.sort_index())
    def test_attI(self):
        """
        Check add_attI in three configurations of the integron: attC sites plus
        integrase, attC sites only, integrase only.
        """
        name = 'saen.040.p01.10'
        fasta_path = self.find_data(
            os.path.join('Replicons', name + '.fst'))
        linear_topology = Topology('lin')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = linear_topology
            replicon = next(seq_db)

        attc_fields = {
            'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
            'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
            'strand': [-1] * 6,
            'evalue': [
                3.400000e-06, 7.500000e-09, 6.800000e-06, 2.800000e-07,
                6.600000e-06, 1.800000e-04
            ],
            'type_elt': ['attC'] * 6,
            'annotation': ['attC'] * 6,
            'model': ['attc_4'] * 6,
            'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0],
        }
        attC = pd.DataFrame(
            attc_fields,
            index=['attc_00{}'.format(rank) for rank in range(1, 7)],
            columns=self.columns).astype(dtype=self.dtype)

        integrase_fields = {
            'pos_beg': 109469,
            'pos_end': 110482,
            'strand': 1,
            'evalue': 1.600000e-24,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan,
        }
        integrase = pd.DataFrame(
            integrase_fields,
            index=['SAEN.040.P01_10_135'],
            columns=self.columns).astype(dtype=self.dtype)

        # attI table expected in the first and third cases below
        attI_fields = {
            'pos_beg': [109330],
            'pos_end': [109388],
            'strand': [-1],
            'evalue': [np.nan],
            'type_elt': 'attI',
            'annotation': 'attI_1',
            'model': 'NA',
            'distance_2attC': [np.nan],
        }
        exp_attI = pd.DataFrame(
            attI_fields,
            index=['attI1'],
            columns=self.columns).astype(dtype=self.dtype)

        # ---- case 1: attI with attC sites and with an integrase ----
        integron = Integron(replicon, self.cfg)
        integron.attC = attC
        integron.integrase = integrase
        integron.add_attI()
        pdt.assert_frame_equal(exp_attI, integron.attI)

        # ---- case 2: attI with attC sites but without integrase ----
        # an empty (typed) attI table is expected
        integron = Integron(replicon, self.cfg)
        integron.attC = attC
        empty_attI = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)
        integron.add_attI()
        pdt.assert_frame_equal(empty_attI, integron.attI)

        # ---- case 3: attI without attC sites but with an integrase ----
        integron = Integron(replicon, self.cfg)
        integron.integrase = integrase
        integron.add_attI()
        pdt.assert_frame_equal(exp_attI, integron.attI)
    def test_add_promoter(self):
        """
        Test Integron.add_promoter on the replicon saen.040.p01.10 (read with a
        linear topology) in three configurations:

        - attC sites and integrase: 2 promoters are expected
        - attC sites only: no promoter is expected
        - integrase only: the same 2 promoters are expected
        """
        replicon_name = 'saen.040.p01.10'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # to test promoters we need to add attC and integrase first,
        # as add_promoter uses attC and integrase
        attC = pd.DataFrame(
            {
                'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
                'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
                'strand': [-1] * 6,
                'evalue': [
                    3.400000e-06, 7.500000e-09, 6.800000e-06, 2.800000e-07,
                    6.600000e-06, 1.800000e-04
                ],
                'type_elt': ['attC'] * 6,
                'annotation': ['attC'] * 6,
                'model': ['attc_4'] * 6,
                'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0]
            },
            index=['attc_00{}'.format(i) for i in range(1, 7)],
            columns=self.columns)
        attC = attC.astype(dtype=self.dtype)

        integrase = pd.DataFrame(
            {
                'pos_beg': 109469,
                'pos_end': 110482,
                'strand': 1,
                'evalue': 1.600000e-24,
                'type_elt': 'protein',
                'annotation': 'intI',
                'model': 'intersection_tyr_intI',
                'distance_2attC': np.nan
            },
            index=['SAEN.040.P01_10_135'],
            columns=self.columns)
        integrase = integrase.astype(dtype=self.dtype)

        ##########################################
        # test promoter with attC with integrase #
        ##########################################
        integron = Integron(replicon, self.cfg)
        integron.attC = attC
        integron.integrase = integrase

        integron.add_promoter()

        exp_promoters = pd.DataFrame(
            {
                'pos_beg': [109413, 109439],
                'pos_end': [109447, 109465],
                'strand': [1, -1],
                'evalue': [np.nan] * 2,
                'type_elt': ['Promoter'] * 2,
                'annotation': ['Pint_1', 'Pc_1'],
                'model': ['NA'] * 2,
                'distance_2attC': [np.nan] * 2
            },
            index=['P_intI1', 'Pc_int1'],
            columns=self.columns)
        exp_promoters = exp_promoters.astype(dtype=self.dtype)

        pdt.assert_frame_equal(exp_promoters, integron.promoter)

        #############################################
        # test promoter with attC without integrase #
        #############################################
        integron = Integron(replicon, self.cfg)
        integron.attC = attC
        integron.add_promoter()

        # without integrase the promoter table must stay empty (but typed)
        empty_promoter = pd.DataFrame(columns=self.columns)
        empty_promoter = empty_promoter.astype(dtype=self.dtype)

        pdt.assert_frame_equal(empty_promoter, integron.promoter)

        #############################################
        # test promoter without attC with integrase #
        #############################################
        integron = Integron(replicon, self.cfg)
        integron.integrase = integrase

        integron.add_promoter()

        # same promoters as in the "attC + integrase" case
        pdt.assert_frame_equal(exp_promoters, integron.promoter)
    def test_type(self):
        """
        Check the value returned by Integron.type() for each combination of elements:

        - neither integrase nor attC -> None
        - integrase only -> "In0"
        - attC only -> "CALIN"
        - integrase and attC -> "complete"
        """
        def new_integron(replicon_id):
            # every case uses an empty sequence, only the replicon id differs
            return Integron(SeqRecord(Seq.Seq(''), id=replicon_id), self.cfg)

        # positional arguments shared by all add_integrase / add_attC calls below
        integrase_args = (10, 100, 'foo', 1, 1e-2, "intersection_tyr_intI")
        attc_args = (10, 100, 1, 1e-2, "intersection_tyr_intI")

        # no element at all
        empty = new_integron('foo')
        self.assertIsNone(empty.type())

        # integrase alone
        in0 = new_integron('just_one_integrase')
        in0.add_integrase(*integrase_args)
        self.assertEqual(in0.type(), "In0")

        # attC alone
        calin = new_integron('just_one_attC')
        calin.add_attC(*attc_args)
        self.assertEqual(calin.type(), "CALIN")

        # integrase then attC
        complete = new_integron('one_integrase_one_attC')
        complete.add_integrase(*integrase_args)
        complete.add_attC(*attc_args)
        self.assertEqual(complete.type(), "complete")
    def test_annot_multi(self):
        """
        Test func_annot when there are 4 integrons:
        - 1 in0
        - 1 calin with 4 proteins, 2 having a resfam annotation
        - 1 calin with 2 proteins, none having a resfam annotation
        - 1 complete with 4 proteins, 3 having a resfam annotation

        func_annot is run three times (strict evalue threshold, default parameters,
        lower coverage threshold); the expected protein tables are checked after each run.
        """
        # resfam for: 16, 13, 3, 12

        # column order shared by all the expected protein tables
        columns = ["pos_beg", "pos_end", "strand", "evalue", "type_elt",
                   "model", "distance_2attC", "annotation"]

        # Create integron in0
        # (built on the replicon itself, like the other integrons, not on its name)
        integron1 = Integron(self.replicon, self.cfg)
        integron1.add_integrase(56, 1014, "ACBA.007.P01_13_1", 1, 1.9e-25, "intersection_tyr_intI")

        # Create integron CALIN with resfam proteins
        integron2 = Integron(self.replicon, self.cfg)
        integron2.add_attC(7400, 7650, -1, 7e-9, "attc_4")
        integron2.add_attC(8600, 8650, -1, 7e-4, "attc_4")
        integron2.add_attC(10200, 10400, -1, 7e-7, "attc_4")
        integron2.add_attC(10800, 10900, -1, 7e-7, "attc_4")
        integron2.add_proteins(self.prot_db)

        # Create integron CALIN without any resfam proteins
        integron3 = Integron(self.replicon, self.cfg)
        integron3.add_attC(4320, 4400, -1, 7e-9, "attc_4")
        integron3.add_proteins(self.prot_db)

        # Create complete integron
        integron4 = Integron(self.replicon, self.cfg)
        integron4.add_attC(17825, 17884, -1, 7e-9, "attc_4")
        integron4.add_attC(19080, 19149, -1, 7e-4, "attc_4")
        integron4.add_attC(19618, 19726, -1, 7e-7, "attc_4")
        integron4.add_integrase(16542, 17381, "ACBA.007.P01_13_19", -1, 1.9e-25, "intersection_tyr_intI")
        integron4.add_proteins(self.prot_db)

        integrons = [integron1, integron2, integron3, integron4]

        # Create dataframes for expected proteins before annotation
        # in0: no protein at all, only an empty typed table
        proteins1 = pd.DataFrame(columns=columns)
        proteins1 = proteins1.astype(dtype={"pos_beg": "int", "pos_end": "int", "strand": "int",
                                            "evalue": "float", "type_elt": "str", "model": "str",
                                            "distance_2attC": "float", "annotation": "str"})
        proteins1 = proteins1[columns]
        proteins1 = proteins1.astype(dtype=self.prot_dtype)

        proteins2 = pd.DataFrame({"pos_beg": [7088, 7710, 8650, 10524],
                                  "pos_end": [7351, 8594, 10125, 11699],
                                  "strand": [1, -1, -1, -1],
                                  "evalue": [np.nan] * 4,
                                  "type_elt": ["protein"] * 4,
                                  "model": ["NA"] * 4,
                                  "distance_2attC": [np.nan] * 4,
                                  "annotation": ["protein"] * 4},
                                 index=["ACBA.007.P01_13_11", "ACBA.007.P01_13_12",
                                        "ACBA.007.P01_13_13", "ACBA.007.P01_13_14"])
        proteins2 = proteins2[columns]
        proteins2 = proteins2.astype(dtype=self.prot_dtype)

        proteins3 = pd.DataFrame({"pos_beg": [3546, 4380],
                                  "pos_end": [4313, 4721],
                                  "strand": [1, 1],
                                  "evalue": [np.nan] * 2,
                                  "type_elt": ["protein"] * 2,
                                  "model": ["NA"] * 2,
                                  "distance_2attC": [np.nan] * 2,
                                  "annotation": ["protein"] * 2},
                                 index=["ACBA.007.P01_13_6", "ACBA.007.P01_13_7"])
        proteins3 = proteins3[columns]
        proteins3 = proteins3.astype(dtype=self.prot_dtype)

        proteins4 = pd.DataFrame({"pos_beg": [17375, 17886, 19090, 19721],
                                  "pos_end": [17722, 18665, 19749, 20254],
                                  "strand": [-1] * 4,
                                  "evalue": [np.nan] * 4,
                                  "type_elt": ["protein"] * 4,
                                  "model": ["NA"] * 4,
                                  "distance_2attC": [np.nan] * 4,
                                  "annotation": ["protein"] * 4},
                                 index=["ACBA.007.P01_13_20", "ACBA.007.P01_13_21",
                                        "ACBA.007.P01_13_22", "ACBA.007.P01_13_23"])
        proteins4 = proteins4[columns]
        proteins4 = proteins4.astype(dtype=self.prot_dtype)

        # The expected tables are updated in place (through .loc) after each
        # annotation run, so this list always holds the current expectations.
        expected_proteins = [proteins1, proteins2, proteins3, proteins4]

        def check_proteins():
            # we need to sort the dataframes:
            # as the protein file is parsed using biopython and index,
            # the order of the sequences is not guaranteed
            for integron, expected in zip(integrons, expected_proteins):
                pdt.assert_frame_equal(integron.proteins.sort_index(), expected.sort_index())

        # Check proteins before annotation
        check_proteins()

        # Annotate proteins with evalue threshold
        func_annot(integrons, self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir, evalue=1e-32)

        # Check that all files generated are as expected
        files_created = [f for f in glob.glob(os.path.join(self.tmp_dir, "*")) if os.path.isfile(f)]
        self.assertEqual(set(self.exp_files), set(files_created))

        # Check that annotated proteins are as expected
        proteins2.loc["ACBA.007.P01_13_13"] = [8650, 10125, -1, 2.4e-86, "protein",
                                               "RF0007", np.nan, "ABC_efflux"]
        proteins4.loc["ACBA.007.P01_13_21"] = [17886, 18665, -1, 7.4e-168, "protein",
                                               "RF0027", np.nan, "ANT3"]
        proteins4.loc["ACBA.007.P01_13_23"] = [19721, 20254, -1, 6.2e-110, "protein",
                                               "RF0003", np.nan, "AAC3-I"]
        check_proteins()

        # Annotate proteins with default evalue (1 more annotation)
        with self.catch_io(out=True):
            func_annot(integrons, self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir)
        # expected only now: its evalue (4.5e-31) is above the 1e-32 threshold
        # used for the first run
        proteins4.loc["ACBA.007.P01_13_20"] = [17375, 17722, -1, 4.5e-31, "protein",
                                               "RF0066", np.nan, "emrE"]
        check_proteins()

        # Annotate proteins with lower coverage threshold (1 more annotation)
        with self.catch_io(out=True):
            func_annot(integrons, self.replicon, self.prot_db, self.hmm_files, self.cfg, self.tmp_dir, coverage=0.4)

        proteins2.loc["ACBA.007.P01_13_12"] = [7710, 8594, -1, 1.6e-5, "protein",
                                               "RF0033", np.nan, "APH3"]
        check_proteins()