def test_get_description(self):
        """
        Check that ProdigalDB.get_description returns the expected SeqDesc
        for two protein identifiers of the replicon.
        """
        # SeqDesc(id, strand, start, stop)
        base_name = 'acba.007.p01.13'
        fasta_path = self.find_data(os.path.join('Replicons', base_name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path
        # the replicon temporary directory is created before building the db
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)

        expected = (
            ('ACBA.007.P01_13_23', SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254)),
            ('ACBA.007.P01_13_1', SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014)),
        )
        for seq_id, desc in expected:
            self.assertEqual(desc, db.get_description(seq_id))
    def test_read_hmm_evalue(self):
        """
        Check the e-value filtering of read_hmm: with a threshold of 1.95e-25
        the hit (e-value 1.9e-25) is expected, with a threshold of 1.9e-25
        an empty frame is expected.
        """
        rep_name = "acba.007.p01.13"
        replicon_id = 'ACBA.007.P01_13'
        columns = [
            "Accession_number", "query_name", "ID_query", "ID_prot", "strand",
            "pos_beg", "pos_end", "evalue"
        ]

        replicon_path = self.find_data(os.path.join('Replicons', rep_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))

        args = argparse.Namespace(gembase=False, replicon=replicon_path)
        cfg = Config(args)

        records = read_multi_prot_fasta(replicon_path)
        replicon = next(records)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        hmm_result = self.find_data(os.path.join(
            "Results_Integron_Finder_{}".format(rep_name),
            "tmp_{}".format(replicon_id),
            "{}_intI.res".format(replicon_id)))

        # threshold above the hit e-value: one row is expected
        kept = read_hmm(rep_name, prot_db, hmm_result, cfg, evalue=1.95e-25)
        hit = {
            "Accession_number": rep_name,
            "query_name": "intI_Cterm",
            "ID_query": "-",
            "ID_prot": "ACBA.007.P01_13_1",
            "strand": 1,
            "pos_beg": 55,
            "pos_end": 1014,
            "evalue": 1.9e-25
        }
        expected_kept = pd.DataFrame(data=hit, index=[0])
        expected_kept = expected_kept[columns]
        pdt.assert_frame_equal(kept, expected_kept)

        # threshold equal to the hit e-value: no row is expected
        filtered = read_hmm(replicon_id, prot_db, hmm_result, cfg, evalue=1.9e-25)
        expected_empty = pd.DataFrame(columns=columns)

        int_cols = ["pos_beg", "pos_end", "strand"]
        float_cols = ["evalue"]
        expected_empty[int_cols] = expected_empty[int_cols].astype(int)
        expected_empty[float_cols] = expected_empty[float_cols].astype(float)
        pdt.assert_frame_equal(filtered, expected_empty)
Beispiel #3
0
    def test_get_description(self):
        """
        Compare the SeqDesc returned by ProdigalDB.get_description with the
        expected one for two protein ids.
        """
        # SeqDesc(id, strand, start, stop)
        replicon_name = 'acba.007.p01.13'
        path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        self.args.replicon = path
        cfg = Config(self.args)
        fasta_records = read_multi_prot_fasta(path)
        replicon = next(fasta_records)
        replicon.path = path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)

        last_desc = SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254)
        first_desc = SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014)
        expected_by_id = [
            ('ACBA.007.P01_13_23', last_desc),
            ('ACBA.007.P01_13_1', first_desc),
        ]
        for seq_id, desc in expected_by_id:
            self.assertEqual(desc, db.get_description(seq_id))
Beispiel #4
0
    def test_make_protfile_no_prodigal(self):
        """
        Check that building a ProdigalDB raises RuntimeError when
        args.prodigal is set to 'foo_bar'.
        """
        base_name = 'acba.007.p01.13'
        fasta_path = self.find_data(os.path.join('Replicons', base_name + '.fst'))
        self.args.replicon = fasta_path
        self.args.prodigal = 'foo_bar'
        cfg = Config(self.args)
        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path

        self.assertRaises(RuntimeError, ProdigalDB, replicon, cfg)
Beispiel #5
0
    def test_ProteinDB(self):
        """
        Check that a ProdigalDB built from a replicon keeps a reference to a
        replicon with the same id.
        """
        file_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        # assertTrue(a, b) would use b as the failure message and pass for any
        # truthy a; assertEqual really compares both ids.
        self.assertEqual(db.replicon.id, replicon.id)
    def test_find_integron_calin_threshold(self):
        """
        Run find_integron twice on the same replicon: two integrons are
        expected with calin_threshold=2 and one with calin_threshold=3.
        """
        replicon_name = 'ESCO001.B.00018.P002'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
        topologies = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        results_dir = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id)))
        attc_file = os.path.join(results_dir, '{}_attc_table.res'.format(replicon.id))
        intI_file = os.path.join(results_dir, '{}_intI.res'.format(replicon.id))
        phageI_file = os.path.join(results_dir, '{}_phage_int.res'.format(replicon.id))

        args = argparse.Namespace(
            no_proteins=False,
            keep_palindromes=True,
            distance_threshold=4000,
            attc_model='attc_4.cm',
            evalue_attc=1.0,
            max_attc_size=200,
            min_attc_size=40,
            local_max=False,
            gembase=False,
            union_integrases=False,
            calin_threshold=2,
        )

        def build_config():
            # a fresh Config over the (possibly updated) args, pointing at the test data
            new_cfg = Config(args)
            new_cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
            return new_cfg

        cfg = build_config()
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        with self.catch_log():
            found = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)

        self.assertEqual(len(found), 2)

        # same data, stricter threshold
        args.calin_threshold = 3
        cfg = build_config()

        with self.catch_log():
            found = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        self.assertEqual(len(found), 1)
Beispiel #7
0
    def test_ProteinDB_no_prodigal(self):
        """
        Check that building a ProdigalDB raises RuntimeError once
        args.prodigal has been set to None.
        """
        base_name = 'acba.007.p01.13'
        fasta_path = self.find_data(os.path.join('Replicons', base_name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        # prodigal is unset after the Config has been created from self.args
        self.args.prodigal = None
        self.assertRaises(RuntimeError, ProdigalDB, replicon, cfg)
Beispiel #8
0
    def test_add_proteins(self):
        """
        Check that Integron.add_proteins fills integron.proteins with the
        expected four proteins for an integron holding seven attC sites.
        """
        replicon_name = 'pssu.001.c01.13'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        prot_file = os.path.join(self._data_dir, '{}.prt.short'.format(replicon_name))

        args = argparse.Namespace(gembase=False, annot_parser_name=None)
        cfg = Config(args)
        integron = Integron(replicon, cfg)

        # attC sites attached to the integron before the proteins are added
        data_attc = {
            "pos_beg": [3072863, 3073496, 3074121, 3075059, 3075593, 3076281, 3076659],
            "pos_end": [3072931, 3073555, 3074232, 3075118, 3075652, 3076340, 3076718],
            "strand": [-1] * 7,
            "evalue": [2.5e-06, 7e-08, 6.5e-08, 3.2e-06, 4.1e-07, 1.4e-08, 4e-08],
            "type_elt": ['attC'] * 7,
            "annotation": ['attC'] * 7,
            "model": ['attc_4'] * 7,
            "distance_2attC": [np.nan, 565.0, 566.0, 827.0, 475.0, 629.0, 319.0],
        }
        attc_count = len(data_attc['pos_beg'])
        attc_index = ['attc_00{}'.format(i) for i in range(attc_count)]
        attC = pd.DataFrame(data_attc, columns=self.columns, index=attc_index)
        attC = attC.astype(dtype=self.dtype)

        integron.attC = attC
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        integron.add_proteins(prot_db)

        protein_index = ['PSSU.001.C01_13_281{}'.format(i) for i in range(5, 9)]
        protein_data = {
            'pos_beg': [3071974, 3072950, 3074243, 3076720],
            'pos_end': [3072855, 3073468, 3075055, 3077511],
            'strand': [-1] * 4,
            'evalue': [np.nan] * 4,
            'type_elt': ['protein'] * 4,
            'annotation': ['protein'] * 4,
            'model': ['NA'] * 4,
            'distance_2attC': [np.nan] * 4,
        }
        exp_proteins = pd.DataFrame(protein_data, index=protein_index, columns=self.columns)
        exp_proteins = exp_proteins.astype(dtype=self.dtype)
        # both frames are sorted on their index before the comparison
        pdt.assert_frame_equal(exp_proteins.sort_index(), integron.proteins.sort_index())
Beispiel #9
0
    def test_protfile(self):
        """
        Check that ProdigalDB.protfile is '<replicon tmp dir>/ACBA.007.P01_13.prt'
        when no protein file is given.
        """
        base_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13.prt'
        fasta_path = self.find_data(os.path.join('Replicons', base_name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        expected_path = os.path.join(cfg.tmp_dir(replicon.id), prot_name)
        self.assertEqual(expected_path, db.protfile)
Beispiel #10
0
    def test_iter(self):
        """
        Check that iterating over a ProdigalDB yields the same sequence ids,
        in the same order, as the reference protein file indexed with SeqIO.
        """
        base_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13.prt'
        fasta_path = self.find_data(os.path.join('Replicons', base_name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        reference_path = self.find_data(os.path.join('Proteins', prot_name))
        # NOTE(review): the alphabet argument relies on Seq.IUPAC; newer Biopython
        # releases may no longer provide it - confirm against the pinned version.
        reference_idx = SeqIO.index(reference_path,
                                    'fasta',
                                    alphabet=Seq.IUPAC.extended_protein)
        # zip stops at the shorter of the two iterables
        for expected_id, received_id in zip(reference_idx, db):
            self.assertEqual(expected_id, received_id)
    def test_read_hmm(self):
        """
        Check that read_hmm, called without extra filtering arguments, returns
        the single expected intI hit of the result file.
        """
        rep_name = "acba.007.p01.13"
        replicon_id = 'ACBA.007.P01_13'
        columns = [
            "Accession_number", "query_name", "ID_query", "ID_prot", "strand",
            "pos_beg", "pos_end", "evalue"
        ]

        replicon_path = self.find_data(os.path.join('Replicons', rep_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))

        args = argparse.Namespace(gembase=False, replicon=replicon_path)
        cfg = Config(args)

        records = read_multi_prot_fasta(replicon_path)
        replicon = next(records)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        hmm_result = self.find_data(os.path.join(
            "Results_Integron_Finder_{}".format(rep_name),
            "tmp_{}".format(replicon_id),
            "{}_intI.res".format(replicon_id)))

        received = read_hmm(rep_name, prot_db, hmm_result, cfg)
        hit = {
            "Accession_number": rep_name,
            "query_name": "intI_Cterm",
            "ID_query": "-",
            "ID_prot": "ACBA.007.P01_13_1",
            "strand": 1,
            "pos_beg": 55,
            "pos_end": 1014,
            "evalue": 1.9e-25
        }
        expected = pd.DataFrame(data=hit, index=[0])
        # put the columns in the order used by the expected frame
        expected = expected[columns]
        pdt.assert_frame_equal(received, expected)
    def test_read_hmm_cov2(self):
        """
        Check the hits returned by read_hmm on the fictive result file when it
        is called with coverage=0.7: two hits are expected.
        """
        rep_name = "acba.007.p01.13"
        replicon_id = 'ACBA.007.P01_13'
        columns = [
            "Accession_number", "query_name", "ID_query", "ID_prot", "strand",
            "pos_beg", "pos_end", "evalue"
        ]

        replicon_path = self.find_data(os.path.join('Replicons', rep_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))

        args = argparse.Namespace(gembase=False, replicon=replicon_path)
        cfg = Config(args)

        records = read_multi_prot_fasta(replicon_path)
        replicon = next(records)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        hmm_result = self.find_data(
            os.path.join("fictive_results", "{}_intI.res".format(replicon_id)))

        received = read_hmm(rep_name, prot_db, hmm_result, cfg, coverage=0.7)
        hits = {
            "Accession_number": [rep_name] * 2,
            "query_name": ["intI_Cterm"] * 2,
            "ID_query": ["-", "-"],
            "ID_prot": ["ACBA.007.P01_13_1", "ACBA.007.P01_13_2"],
            "strand": [1, -1],
            "pos_beg": [55, 905],
            "pos_end": [1014, 1609],
            "evalue": [1.9e-25, 1e-3]
        }
        expected = pd.DataFrame(data=hits, index=[0, 1])
        expected = expected[columns]
        pdt.assert_frame_equal(received, expected)
Beispiel #13
0
    def test_make_protfile_no_dir(self):
        """
        Check the protein file produced by ProdigalDB when the replicon
        temporary directory has not been created by the test beforehand.

        The ids of the produced sequences are compared, pairwise and in order,
        with those of the reference protein file; 23 pairs are expected.
        """
        file_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13.prt'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path

        db = ProdigalDB(replicon, cfg)
        expected_seqs = read_multi_prot_fasta(
            self.find_data(os.path.join('Proteins', prot_name)))
        produced_seqs = read_multi_prot_fasta(db.protfile)
        # initialised so that an empty iteration fails on the final assertion
        # instead of raising NameError on an unbound loop variable
        seq_nb = 0
        for seq_nb, (expected, test) in enumerate(zip(expected_seqs, produced_seqs), 1):
            self.assertEqual(expected.id, test.id)
        self.assertEqual(seq_nb, 23)
Beispiel #14
0
    def setUp(self):
        """
        Build the fixtures shared by the tests: replicon path and id, the
        first sequence of the replicon (linear topology), the protein file,
        the protein db and the distance threshold.
        """
        replicon_file = os.path.join('Replicons', "acba.007.p01.13.fst")
        self.replicon_path = self.find_data(replicon_file)
        self.replicon_id = 'ACBA.007.P01_13'
        linear = Topology('lin')
        with FastaIterator(self.replicon_path) as sequences_db:
            sequences_db.topologies = linear
            self.seq = next(sequences_db)

        prot_rel_path = os.path.join("Results_Integron_Finder_acba.007.p01.13",
                                     "tmp_{}".format(self.replicon_id),
                                     "{}.prt".format(self.replicon_id))
        self.prot_file = self.find_data(prot_rel_path)
        # Config is built from an empty namespace
        cfg = Config(argparse.Namespace())
        self.prot_db = ProdigalDB(self.seq, cfg, prot_file=self.prot_file)
        self.dist_threshold = 4000
Beispiel #15
0
    def test_getitem(self):
        """
        Check item access on a ProdigalDB: every protein of the reference file
        is retrieved by id with the same id and sequence, and an unknown id
        raises a KeyError whose message is the quoted id.
        """
        base_name = 'acba.007.p01.13'
        prot_name = 'ACBA.007.P01_13.prt'
        fasta_path = self.find_data(os.path.join('Replicons', base_name + '.fst'))
        self.args.replicon = fasta_path
        cfg = Config(self.args)
        records = read_multi_prot_fasta(fasta_path)
        replicon = next(records)
        replicon.path = fasta_path
        os.makedirs(cfg.tmp_dir(replicon.id))

        db = ProdigalDB(replicon, cfg)
        reference_path = self.find_data(os.path.join('Proteins', prot_name))
        reference_proteins = read_multi_prot_fasta(reference_path)
        for expected_prot in reference_proteins:
            received_prot = db[expected_prot.id]
            self.assertEqual(received_prot.id, expected_prot.id)
            self.assertEqual(received_prot.seq, expected_prot.seq)

        unknown_id = 'nimport_naoik'
        with self.assertRaises(KeyError) as ctx:
            db[unknown_id]
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
    def test_read_empty(self):
        """
        Check that read_hmm returns an empty dataframe, with the expected
        columns and dtypes, for the '<replicon_id>_intI-empty.res' result file.
        """
        rep_name = "acba.007.p01.13"
        replicon_id = 'ACBA.007.P01_13'
        columns = [
            "Accession_number", "query_name", "ID_query", "ID_prot", "strand",
            "pos_beg", "pos_end", "evalue"
        ]

        replicon_path = self.find_data(os.path.join('Replicons', rep_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))

        args = argparse.Namespace(gembase=False, replicon=replicon_path)
        cfg = Config(args)

        records = read_multi_prot_fasta(replicon_path)
        replicon = next(records)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        hmm_result = self.find_data(
            os.path.join("fictive_results",
                         "{}_intI-empty.res".format(replicon_id)))

        received = read_hmm(rep_name, prot_db, hmm_result, cfg)
        expected = pd.DataFrame(columns=columns)

        # give the empty frame the integer / float dtypes expected from read_hmm
        int_cols = ["pos_beg", "pos_end", "strand"]
        float_cols = ["evalue"]
        expected[int_cols] = expected[int_cols].astype(int)
        expected[float_cols] = expected[float_cols].astype(float)
        pdt.assert_frame_equal(received, expected)
    def test_find_integron_proteins_n_union_integrase(self):
        replicon_name = 'OBAL001.B.00005.C001'
        replicon_id = 'OBAL001.B.00005.C001'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_name + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
        attc_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.calin_threshold = 2
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = True
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 3 complete integron(s) found with a total 4 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 2 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 5)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        exp_int = []
        exp_int.append(
            pd.DataFrame([[
                418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_388'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                434671, 440118, -1, 0.085, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_399'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_472'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                1940269, 1941171, 1, 4.200000e-43, 'protein',
                'Phage_integrase', np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_1793'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                1545830, 1546807, -1, 1.100000e-21, 'protein',
                'intersection_tyr_intI', np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_1416'
                                ]).astype(dtype=self.dtype))

        exp_attC = []
        exp_attC.append(
            pd.DataFrame(
                [[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
                columns=self.columns,
                index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(
            pd.DataFrame([[
                442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan,
                'attC'
            ]],
                         columns=self.columns,
                         index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(empty)
        exp_attC.append(empty)
        exp_attC.append(
            pd.DataFrame([[
                1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'
            ], [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']
                          ],
                         columns=self.columns,
                         index=['attc_001',
                                'attc_002']).astype(dtype=self.dtype))

        for i, integron in enumerate(integrons):
            self.assertEqual(integron.replicon.name, replicon_id)
            pdt.assert_frame_equal(integron.integrase, exp_int[i])
            pdt.assert_frame_equal(integron.attC, exp_attC[i])
            pdt.assert_frame_equal(integron.promoter, empty)
            pdt.assert_frame_equal(integron.attI, empty)
            pdt.assert_frame_equal(integron.proteins, empty)
    def test_find_integron_proteins_circ_replicon(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.circular'
        attc_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False

        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.union_integrases = False
        args.keep_palindromes = True
        args.calin_threshold = 2
        args.local_max = False

        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 1 complete integron(s) found with a total 3 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame(
            {
                'annotation': 'intI',
                'distance_2attC': np.nan,
                'evalue': 1.900000e-25,
                'model': 'intersection_tyr_intI',
                'pos_beg': 55,
                'pos_end': 1014,
                'strand': 1,
                'type_elt': 'protein'
            },
            columns=self.columns,
            index=['ACBA.007.P01_13_1'])
        exp = exp.astype(dtype=self.dtype)
        pdt.assert_frame_equal(integron.integrase, exp)

        exp = pd.DataFrame(
            {
                'annotation': ['attC'] * 3,
                'distance_2attC': [np.nan, 1196.0, 469.0],
                'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                'model': ['attc_4'] * 3,
                'pos_beg': [17825, 19080, 19618],
                'pos_end': [17884, 19149, 19726],
                'strand': [-1, -1, -1],
                'type_elt': 'attC'
            },
            columns=self.columns,
            index=['attc_001', 'attc_002', 'attc_003'])
        exp = exp.astype(dtype=self.dtype)
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
    def test_find_integron_attC_is_df(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        attc_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))

        intI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))

        args = argparse.Namespace()
        args.no_proteins = True
        args.keep_palindromes = True
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000
        args.calin_threshold = 2
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)

        attc_file = read_infernal(attc_file,
                                  replicon_name,
                                  len_model_attc,
                                  evalue=cfg.evalue_attc,
                                  size_max_attc=cfg.max_attc_size,
                                  size_min_attc=cfg.min_attc_size)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)

        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame(
            {
                'annotation': ['attC'] * 3,
                'distance_2attC': [np.nan, 1196.0, 469.0],
                'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                'model': ['attc_4'] * 3,
                'pos_beg': [17825, 19080, 19618],
                'pos_end': [17884, 19149, 19726],
                'strand': [-1, -1, -1],
                'type_elt': 'attC'
            },
            columns=self.columns,
            index=['attc_001', 'attc_002', 'attc_003'])
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.integrase, exp)
        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
Beispiel #20
0
def find_integron_in_one_replicon(replicon, config):
    """
    Scan one replicon for integrons and write the results on disk.

    The search looks for:

      * presence of integrase (skipped when ``config.no_proteins`` is set)
      * presence of attC sites
      * presence of promoters and attI sites (when ``config.promoter_attI`` is set)

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    Intermediate results are stored in ``config.tmp_dir(replicon.id)``;
    files already present there (integrase hits, attC table, local_max pickle)
    are reused instead of being recomputed. This directory is removed at the end
    unless ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id.summary>).
              if there is no integron the summary file is None.
              if the replicon is skipped (:class:`integron_finder.EmptyFileError`)
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested but neither 'bank_hmm'
                           nor ``config.func_annot_path`` exists.
    :raises OSError: if the temporary directory cannot be created.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # A directory left by a previous run is expected and reused.
        # Any other failure (permission, missing parent, ...) is re-raised here
        # rather than surfacing later as an unrelated error while writing into it.
        if not os.path.isdir(result_tmp_dir):
            raise
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provided on the command line
    fa_hmm = None
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError(
                "the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                "profile with --path-func-annot option".format(
                    config.func_annot_path))
        is_func_annot = True
    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning(
            "No hmm profiles for functional annotation detected, skip functional annotation step."
        )

    if config.gembase_path:
        protein_db = GembaseDB(replicon,
                               config,
                               gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir,
                                     replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # both result files are needed, recompute if either is missing
            if not (os.path.isfile(intI_file) and os.path.isfile(phageI_file)):
                find_integrase(replicon.id, protein_db.protfile,
                               result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path,
                      replicon.name,
                      config.cmsearch,
                      result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_path = os.path.join(result_tmp_dir, "integron_max.pickle")
            if not os.path.isfile(integron_max_path):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(
                    integrons,
                    replicon,
                    config.distance_threshold,
                    config.model_attc_path,
                    max_attc_size=config.max_attc_size,
                    min_attc_size=config.min_attc_size,
                    circular=circular,
                    out_dir=result_tmp_dir,
                    cpu=config.cpu,
                    evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_path)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): unpickling executes arbitrary code; this assumes the
                # temporary directory only holds files written by a previous run
                # of this program -- confirm before reusing a tmp dir of unknown origin.
                integron_max = pd.read_pickle(integron_max_path)
                # the cached hits are filtered again with the evalue and size
                # thresholds of the current configuration
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[
                    (integron_max.evalue < config.evalue_attc)
                    & (attc_size < config.max_attc_size)
                    & (config.min_attc_size < attc_size)]
                _log.info(
                    "Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0" and not config.no_proteins:  # complete & CALIN
                _log.info("Adding proteins ... :")
                integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config,
                       result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # only complete integrons are drawn, the file number is the rank
            # of the integron among all integrons found (starting at 1)
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(
                        config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file,
                                    sep="\t",
                                    index=False,
                                    na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file,
                           sep="\t",
                           na_rep="NA",
                           index=False,
                           columns=[
                               'ID_replicon', 'ID_integron', 'complete', 'In0',
                               'CALIN'
                           ])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db,
                            config.distance_threshold)
                SeqIO.write(
                    replicon,
                    os.path.join(config.result_dir, replicon.id + ".gbk"),
                    "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped, empty paths tell the caller
        # that no result file has been produced for it
        _log.warning('############ Skip replicon {} ############'.format(
            replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # best effort: a failure to clean up must not discard the results
            _log.warning("Cannot remove temporary results : '{} : {}'".format(
                result_tmp_dir, str(err)))

    return integron_file, summary_file