def test_find_integron_calin_threshold(self):
        """
        Run ``find_integron`` twice on the ESCO001.B.00018.P002 fixture, first
        with ``calin_threshold`` 2 and then 3, and check that 2 and then 1
        integron(s) are returned.
        """
        replicon_name = 'ESCO001.B.00018.P002'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))

        topology = Topology('circ')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = topology
            replicon = next(sequences_db)

        # Pre-computed result files shipped with the test data.
        results_dir = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id)))
        attc_file, intI_file, phageI_file = [
            os.path.join(results_dir, pattern.format(replicon.id))
            for pattern in ('{}_attc_table.res', '{}_intI.res', '{}_phage_int.res')
        ]

        args = argparse.Namespace(
            no_proteins=False,
            keep_palindromes=True,
            distance_threshold=4000,
            attc_model='attc_4.cm',
            evalue_attc=1.0,
            max_attc_size=200,
            min_attc_size=40,
            local_max=False,
            gembase=False,
            union_integrases=False,
            calin_threshold=2,
        )
        data_prefix = os.path.join(os.path.dirname(__file__), 'data')

        # First pass: calin_threshold == 2
        cfg = Config(args)
        cfg._prefix_data = data_prefix
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        with self.catch_log():
            found = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        self.assertEqual(len(found), 2)

        # Second pass: same inputs, calin_threshold raised to 3
        args.calin_threshold = 3
        cfg = Config(args)
        cfg._prefix_data = data_prefix
        with self.catch_log():
            found = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        self.assertEqual(len(found), 1)
    def test_find_integrase_gembase(self):
        """
        ``find_integrase`` run with ``gembase`` enabled on a copy of the
        fixture protein file must create the four expected result files in
        ``self.tmp_dir``.
        """
        # Set the option *before* building the Config (as the other gembase
        # tests do) so the test does not depend on Config reading args lazily.
        self.args.gembase = True
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))

        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # Work on a copy of the protein fixture located in the temp directory.
        prot_file = os.path.join(self.tmp_dir, replicon_name + ".prt")
        shutil.copyfile(
            self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
            prot_file)

        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

        for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res',
                       '_phage_int_table.res'):
            res = os.path.join(self.tmp_dir, replicon.id + suffix)
            self.assertTrue(os.path.exists(res),
                            "expected result file is missing: {}".format(res))
    def test_find_integrase_no_gembase_no_protfile(self):
        """
        With ``gembase`` disabled, the replicon length patched to 500000 and an
        empty protein file, ``find_integrase`` must raise ``EmptyFileError``.
        """
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # Create an empty protein file.
        prot_file = os.path.join(self.tmp_dir, "foo.prt")
        with open(prot_file, 'w'):
            pass

        # Patch __len__ on the replicon class. The try/finally only covers the
        # patched region, so a failure in the setup above is reported as is
        # instead of being masked by an unbound name in the cleanup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            with self.catch_log():
                with self.assertRaises(EmptyFileError):
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
        finally:
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_no_protfile_no_prodigal(self):
        """
        With ``hmmsearch`` set to the non-existent command 'foo',
        ``find_integrase`` must raise a ``RuntimeError`` whose message reports
        "No such file or directory: 'foo'".
        """
        self.args.hmmsearch = 'foo'
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(
            self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
            prot_file)

        # Patch __len__ on the replicon class. The try/finally only covers the
        # patched region, so a failure in the setup above is reported as is
        # instead of being masked by an unbound name in the cleanup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir,
                                         cfg)
            # Raw string: '\[' and '\]' are regex escapes, not string escapes.
            self.assertTrue(
                re.search(
                    r"failed : \[Errno 2\] No such file or directory: 'foo'",
                    str(ctx.exception)))
        finally:
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_no_protfile_short_seq(self):
        """
        With ``gembase`` disabled and the replicon length patched to 200,
        ``find_integrase`` run on a copy of the fixture protein file must
        create the four expected result files in ``self.tmp_dir``.
        """
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_name = 'ACBA.007.P01_13'
        prot_path = self.find_data(
            os.path.join('Proteins', prot_name + '.prt'))

        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(prot_path, prot_file)

        # Patch __len__ on the replicon class. The try/finally only covers the
        # patched region, so a failure in the setup above is reported as is
        # instead of being masked by an unbound name in the cleanup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
            for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res',
                           '_phage_int_table.res'):
                res = os.path.join(self.tmp_dir, replicon.id + suffix)
                self.assertTrue(os.path.exists(res),
                                "expected result file is missing: {}".format(res))
        finally:
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_with_protfile_empty(self):
        """
        With ``gembase`` disabled, the replicon length patched to 200 and an
        empty protein file, ``find_integrase`` must raise ``EmptyFileError``
        with the "is empty cannot perform hmmsearch" message.
        """
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # Create an empty protein file named after the replicon.
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        with open(prot_file, 'w'):
            pass

        # Patch __len__ on the replicon class. The try/finally only covers the
        # patched region, so a failure in the setup above is reported as is
        # instead of being masked by an unbound name in the cleanup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            with self.assertRaises(EmptyFileError) as ctx:
                with self.catch_log():
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
            self.assertTrue(
                re.match(
                    "^The protein file: '.*' is empty cannot perform hmmsearch on it.$",
                    str(ctx.exception)))
        finally:
            replicon_cls.__len__ = len_ori
    def test_find_integron_calin_threshold(self):
        """
        ``find_integron`` on the ESCO001.B.00018.P002 fixture must return
        2 integrons with ``calin_threshold`` 2 and 1 integron with
        ``calin_threshold`` 3.
        """
        replicon_name = 'ESCO001.B.00018.P002'
        fasta = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_name + '.prt'))
        topology = Topology('circ')
        with FastaIterator(fasta) as sequences_db:
            sequences_db.topologies = topology
            replicon = next(sequences_db)

        res_dir = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id)))
        attc_file = os.path.join(
            res_dir, '{}_attc_table.res'.format(replicon.id))
        intI_file = os.path.join(
            res_dir, '{}_intI.res'.format(replicon.id))
        phageI_file = os.path.join(
            res_dir, '{}_phage_int.res'.format(replicon.id))

        args = argparse.Namespace()
        args.no_proteins = False
        args.keep_palindromes = True
        args.distance_threshold = 4000
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.local_max = False
        args.gembase = False
        args.union_integrases = False

        def config_for(threshold):
            # fresh Config built on the shared namespace with the given threshold
            args.calin_threshold = threshold
            conf = Config(args)
            conf._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
            return conf

        cfg = config_for(2)
        # the protein db is built once, with the first configuration
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        with self.catch_log():
            integrons = find_integron(
                replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 2)

        cfg = config_for(3)
        with self.catch_log():
            integrons = find_integron(
                replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 1)
    def test_find_integrase_gembase_hmmer_error(self):
        """
        With ``gembase`` enabled and ``cpu`` set to the invalid value 'foo',
        ``find_integrase`` must raise a ``RuntimeError`` whose message ends
        with 'failed return code = 1'.
        """
        self.args.gembase = True
        self.args.cpu = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        fasta = os.path.join(
            self._data_dir, 'Replicons', replicon_name + '.fst')
        topology = Topology('lin')
        with FastaIterator(fasta) as sequences_db:
            sequences_db.topologies = topology
            replicon = next(sequences_db)

        # copy the protein fixture into the temp directory
        prot_name = replicon.id + ".prt"
        prot_file = os.path.join(self.tmp_dir, prot_name)
        shutil.copyfile(
            os.path.join(self._data_dir, 'Proteins', prot_name), prot_file)

        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        message = str(ctx.exception)
        self.assertTrue(message.endswith('failed return code = 1'))
    def test_find_integrase_gembase_no_hmmer_no_replicon(self):
        """
        When the protein file does not exist, ``find_integrase`` must raise a
        ``RuntimeError`` saying the protein file "does not exists".
        """
        self.args.gembase = True
        self.args.hmmsearch = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        fasta = os.path.join(
            self._data_dir, 'Replicons', replicon_name + '.fst')
        topology = Topology('lin')
        with FastaIterator(fasta) as sequences_db:
            sequences_db.topologies = topology
            replicon = next(sequences_db)

        # this path is deliberately never created
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        expected = "The protein file: '{}' does not exists cannot perform hmmsearch on it.".format(prot_file)

        with self.catch_log():
            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(
                    replicon.id, prot_file, self.tmp_dir, cfg)
            self.assertEqual(expected, str(ctx.exception))
    def test_find_integron_proteins_n_union_integrase(self):
        replicon_name = 'OBAL001.B.00005.C001'
        replicon_id = 'OBAL001.B.00005.C001'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_name + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
        attc_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.calin_threshold = 2
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = True
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 3 complete integron(s) found with a total 4 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 2 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 5)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        exp_int = []
        exp_int.append(
            pd.DataFrame([[
                418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_388'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                434671, 440118, -1, 0.085, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_399'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_472'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                1940269, 1941171, 1, 4.200000e-43, 'protein',
                'Phage_integrase', np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_1793'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                1545830, 1546807, -1, 1.100000e-21, 'protein',
                'intersection_tyr_intI', np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_1416'
                                ]).astype(dtype=self.dtype))

        exp_attC = []
        exp_attC.append(
            pd.DataFrame(
                [[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
                columns=self.columns,
                index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(
            pd.DataFrame([[
                442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan,
                'attC'
            ]],
                         columns=self.columns,
                         index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(empty)
        exp_attC.append(empty)
        exp_attC.append(
            pd.DataFrame([[
                1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'
            ], [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']
                          ],
                         columns=self.columns,
                         index=['attc_001',
                                'attc_002']).astype(dtype=self.dtype))

        for i, integron in enumerate(integrons):
            self.assertEqual(integron.replicon.name, replicon_id)
            pdt.assert_frame_equal(integron.integrase, exp_int[i])
            pdt.assert_frame_equal(integron.attC, exp_attC[i])
            pdt.assert_frame_equal(integron.promoter, empty)
            pdt.assert_frame_equal(integron.attI, empty)
            pdt.assert_frame_equal(integron.proteins, empty)
    def test_find_integron_proteins_circ_replicon(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.circular'
        attc_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False

        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.union_integrases = False
        args.keep_palindromes = True
        args.calin_threshold = 2
        args.local_max = False

        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 1 complete integron(s) found with a total 3 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame(
            {
                'annotation': 'intI',
                'distance_2attC': np.nan,
                'evalue': 1.900000e-25,
                'model': 'intersection_tyr_intI',
                'pos_beg': 55,
                'pos_end': 1014,
                'strand': 1,
                'type_elt': 'protein'
            },
            columns=self.columns,
            index=['ACBA.007.P01_13_1'])
        exp = exp.astype(dtype=self.dtype)
        pdt.assert_frame_equal(integron.integrase, exp)

        exp = pd.DataFrame(
            {
                'annotation': ['attC'] * 3,
                'distance_2attC': [np.nan, 1196.0, 469.0],
                'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                'model': ['attc_4'] * 3,
                'pos_beg': [17825, 19080, 19618],
                'pos_end': [17884, 19149, 19726],
                'strand': [-1, -1, -1],
                'type_elt': 'attC'
            },
            columns=self.columns,
            index=['attc_001', 'attc_002', 'attc_003'])
        exp = exp.astype(dtype=self.dtype)
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
    def test_find_integron_attC_is_df(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        attc_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))

        intI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))

        args = argparse.Namespace()
        args.no_proteins = True
        args.keep_palindromes = True
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000
        args.calin_threshold = 2
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)

        attc_file = read_infernal(attc_file,
                                  replicon_name,
                                  len_model_attc,
                                  evalue=cfg.evalue_attc,
                                  size_max_attc=cfg.max_attc_size,
                                  size_min_attc=cfg.min_attc_size)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)

        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame(
            {
                'annotation': ['attC'] * 3,
                'distance_2attC': [np.nan, 1196.0, 469.0],
                'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                'model': ['attc_4'] * 3,
                'pos_beg': [17825, 19080, 19618],
                'pos_end': [17884, 19149, 19726],
                'strand': [-1, -1, -1],
                'type_elt': 'attC'
            },
            columns=self.columns,
            index=['attc_001', 'attc_002', 'attc_003'])
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.integrase, exp)
        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
    def test_find_integron(self):
        replicon_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', prot_name + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        replicon_results_path = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                                            'tmp_{}'.format(replicon.id)))
        attc_file = os.path.join(replicon_results_path, '{}_attc_table.res'.format(replicon.id))
        intI_file = os.path.join(replicon_results_path, '{}_intI.res'.format(replicon.id))
        phageI_file = os.path.join(replicon_results_path, '{}_phage_int.res'.format(replicon.id))

        args = argparse.Namespace()
        args.no_proteins = True
        args.keep_palindromes = True
        args.distance_threshold = 4000
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.calin_threshold = 2
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
            catch_msg = log.get_value().strip()

        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.id, replicon.id)

        exp = pd.DataFrame({'annotation': ['attC'] * 3,
                            'distance_2attC': [np.nan, 1196.0, 469.0],
                            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                            'model': ['attc_4'] * 3,
                            'pos_beg': [17825, 19080, 19618],
                            'pos_end': [17884, 19149, 19726],
                            'strand': [-1, -1, -1],
                            'type_elt': 'attC'},
                           columns=self.columns,
                           index=['attc_001', 'attc_002', 'attc_003'])
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns,)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.integrase, exp)
        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
    def test_find_integron_proteins_n_union_integrase(self):
        replicon_name = 'OBAL001.B.00005.C001'
        replicon_id = 'OBAL001.B.00005.C001'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
        attc_file = self.find_data(os.path.join(result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(os.path.join(result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(os.path.join(result_dir,
                                                  'tmp_{}'.format(replicon.id),
                                                  '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.calin_threshold = 2
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = True
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 3 complete integron(s) found with a total 4 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 2 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon,
                                      prot_db,
                                      attc_file,
                                      intI_file,
                                      phageI_file,
                                      cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 5)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        exp_int = []
        exp_int.append(pd.DataFrame(
            [[418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_388']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[434671, 440118, -1, 0.085, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_399']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_472']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[1940269, 1941171, 1, 4.200000e-43, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_1793']).astype(dtype=self.dtype))
        exp_int.append(pd.DataFrame(
            [[1545830, 1546807, -1, 1.100000e-21, 'protein', 'intersection_tyr_intI', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_1416']).astype(dtype=self.dtype))

        exp_attC = []
        exp_attC.append(pd.DataFrame(
            [[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
            columns=self.columns,
            index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(pd.DataFrame(
            [[442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan, 'attC']],
            columns=self.columns,
            index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(empty)
        exp_attC.append(empty)
        exp_attC.append(pd.DataFrame(
            [[1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'],
             [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']],
            columns=self.columns,
            index=['attc_001', 'attc_002']).astype(dtype=self.dtype))

        for i, integron in enumerate(integrons):
            self.assertEqual(integron.replicon.name, replicon_id)
            pdt.assert_frame_equal(integron.integrase, exp_int[i])
            pdt.assert_frame_equal(integron.attC, exp_attC[i])
            pdt.assert_frame_equal(integron.promoter, empty)
            pdt.assert_frame_equal(integron.attI, empty)
            pdt.assert_frame_equal(integron.proteins, empty)
    def test_find_integron_proteins_lin_replicon(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.linear'
        attc_file = self.find_data(os.path.join(exp_result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(os.path.join(exp_result_dir,
                                                'tmp_{}'.format(replicon.id),
                                                '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(os.path.join(exp_result_dir,
                                                  'tmp_{}'.format(replicon.id),
                                                  '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False

        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.union_integrases = False
        args.keep_palindromes = True
        args.calin_threshold = 2
        args.local_max = False

        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 1 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon,
                                      prot_db,
                                      attc_file,
                                      intI_file,
                                      phageI_file,
                                      cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 2)

        exp_int = []
        exp = pd.DataFrame({'annotation': 'intI',
                            'distance_2attC': np.nan,
                            'evalue':  1.900000e-25,
                            'model': 'intersection_tyr_intI',
                            'pos_beg': 55,
                            'pos_end': 1014,
                            'strand': 1,
                            'type_elt': 'protein'},
                           columns=self.columns,
                           index=['ACBA.007.P01_13_1'])
        exp = exp.astype(dtype=self.dtype)
        exp_int.append(exp)
        exp_int.append(pd.DataFrame(columns=self.columns).astype(dtype=self.dtype))

        exp_attC = [pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)]
        exp = pd.DataFrame({'annotation': ['attC'] * 3,
                            'distance_2attC': [np.nan, 1196.0,  469.0],
                            'evalue':  [1.000000e-09, 1.000000e-04, 1.100000e-07],
                            'model': ['attc_4'] * 3,
                            'pos_beg': [17825, 19080, 19618],
                            'pos_end': [17884, 19149, 19726],
                            'strand': [-1, -1, -1],
                            'type_elt': 'attC'},
                           columns=self.columns,
                           index=['attc_001', 'attc_002', 'attc_003'])
        exp = exp.astype(dtype=self.dtype)
        exp_attC.append(exp)
        empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        for i, integron in enumerate(integrons):
            self.assertEqual(integron.replicon.name, replicon_id)
            pdt.assert_frame_equal(integron.integrase, exp_int[i])
            pdt.assert_frame_equal(integron.attC, exp_attC[i])

            pdt.assert_frame_equal(integron.promoter, empty)
            pdt.assert_frame_equal(integron.attI, empty)
            pdt.assert_frame_equal(integron.proteins, empty)