def test_parse_topology(self):
     """
     Check that each accepted spelling is mapped to 'circ' or 'lin'
     and that an unknown value raises RuntimeError with an explicit message.
     """
     topo = Topology('circ')
     accepted = (
         ('circ', ('circ', 'circular', 'CIRC', 'CIRCULAR')),
         ('lin', ('lin', 'linear', 'LIN', 'LINEAR')),
     )
     for normalised, spellings in accepted:
         for spelling in spellings:
             self.assertEqual(topo._parse_topology(spelling), normalised)

     # any other value must be rejected
     with self.assertRaises(RuntimeError) as ctx:
         topo._parse_topology('foo')
     self.assertEqual(str(ctx.exception), "'foo' is not allowed for topology")
 def test_parse(self):
     """
     Parse the 'topology.txt' data file and compare the resulting
     ``_topology`` mapping with the expected per-sequence values.
     """
     topo = Topology('circ')
     topo._parse(self.find_data('topology.txt'))

     circular_ids = ('seq1', 'seq2', 'seq5', 'seq6')
     linear_ids = ('seq3', 'seq4', 'seq7', 'seq8')
     expected = {seq_id: 'circ' for seq_id in circular_ids}
     expected.update((seq_id, 'lin') for seq_id in linear_ids)

     self.assertDictEqual(expected, topo._topology)
 def test_parse(self):
     """
     Load 'topology.txt' through ``Topology._parse`` and check that
     the stored ``_topology`` dict holds the expected sequence ids and values.
     """
     topo = Topology('circ')
     topology_file = self.find_data('topology.txt')
     topo._parse(topology_file)

     expected = dict.fromkeys(('seq1', 'seq2', 'seq5', 'seq6'), 'circ')
     expected.update(dict.fromkeys(('seq3', 'seq4', 'seq7', 'seq8'), 'lin'))
     self.assertDictEqual(expected, topo._topology)
    def test_find_integrase_gembase(self):
        """
        Run ``integrase.find_integrase`` with ``gembase`` set to True and check
        that the four expected result files are created in the temporary directory.
        """
        cfg = Config(self.args)
        self.args.gembase = True
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))

        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        # Copy the reference protein file into the temporary directory and
        # hand that copy to find_integrase.
        prot_file = os.path.join(self.tmp_dir, replicon_name + ".prt")
        reference_prot = self.find_data(os.path.join('Proteins', replicon.id + ".prt"))
        shutil.copyfile(reference_prot, prot_file)

        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

        expected_suffixes = ('_intI.res', '_intI_table.res', '_phage_int.res', '_phage_int_table.res')
        for suffix in expected_suffixes:
            result_path = os.path.join(self.tmp_dir, replicon.id + suffix)
            self.assertTrue(os.path.exists(result_path))
Exemple #5
0
    def test_expand_linear_left(self):
        """
        Call ``infernal.expand`` on the 'lian.001.c02.10' replicon with
        ``circular`` False, ``search_left=True`` and ``search_right=False``,
        and compare the returned frame with 'max_elt_output_lian_left.csv'.
        """
        replicon_name = 'lian.001.c02.10'
        circular = False
        dist_threshold = 4000
        max_attc_size = 200
        min_attc_size = 40

        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        def load_csv(basename):
            # every input/expected table of this test lives in the data directory
            return pd.read_csv(os.path.join(self._data_dir, basename))

        max_elt_input = load_csv('max_elt_input_1.csv')
        df_max_input = load_csv('df_max_input_1.csv')
        max_elt_expected = load_csv('max_elt_output_lian_left.csv')

        window_beg, window_end = 934689, 943099
        max_elt_received = infernal.expand(
            replicon,
            window_beg,
            window_end,
            max_elt_input,
            df_max_input,
            circular,
            dist_threshold,
            self.model_attc_path,
            max_attc_size,
            min_attc_size,
            search_left=True,
            search_right=False,
        )
        pdt.assert_frame_equal(max_elt_expected, max_elt_received)
Exemple #6
0
    def setUp(self):
        """
        Prepare the fixtures: locate the installation root, create an empty
        temporary directory, load the 'lian.001.c02.10' replicon and set the
        attC search parameters used by the tests.
        """
        self.local_install = 'INTEGRON_HOME' in os.environ
        if self.local_install:
            self.integron_home = os.environ['INTEGRON_HOME']
        else:
            # fall back to the directory two levels above this test file
            two_levels_up = os.path.join(os.path.dirname(__file__), '..', '..')
            self.integron_home = os.path.normpath(os.path.abspath(two_levels_up))

        # start from a fresh temporary directory
        self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
        if os.path.isdir(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.makedirs(self.tmp_dir)

        self.cmsearch = which('cmsearch')
        self.out_dir = self.tmp_dir
        self.model_attc_path = self.find_data(os.path.join('Models', 'attc_4.cm'))
        self.cpu_nb = 1

        replicon_name = 'lian.001.c02.10'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            self.replicon = next(sequences_db)

        self.evalue_attc = 1.
        self.max_attc_size = 200
        self.min_attc_size = 40
        self.length_cm = 47  # length in 'CLEN' (value for model attc_4.cm)
        self.call = call_wrapper()
        # NOTE: this rebinds infernal.read_infernal at module level; nothing in
        # this method restores it.
        infernal.read_infernal = read_infernal_mock(self.tmp_dir)
    def test_find_integrase_no_gembase_with_protfile_empty(self):
        """
        ``integrase.find_integrase`` must raise ``EmptyFileError`` when the
        protein file exists but is empty (``gembase`` is False).

        ``__len__`` of the replicon class is temporarily forced to 200 and is
        always restored afterwards.
        """
        # Everything that can fail before the patch is done outside the
        # try/finally, so the cleanup never refers to unbound names and never
        # hides the original error.
        cfg = Config(self.args)
        self.args.gembase = False
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
            # create an empty protein file
            with open(prot_file, 'w'):
                pass
            with self.assertRaises(EmptyFileError) as ctx:
                with self.catch_log():
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
            self.assertTrue(
                re.match(
                    "^The protein file: '.*' is empty cannot perform hmmsearch on it.$",
                    str(ctx.exception)))
        finally:
            # the patch is class-wide: undo it whatever happened
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_no_protfile_no_prodigal(self):
        """
        ``integrase.find_integrase`` must raise ``RuntimeError`` when the
        configured ``hmmsearch`` executable ('foo') cannot be found.

        ``__len__`` of the replicon class is temporarily forced to 500000 and
        is always restored afterwards.
        """
        # Setup is done outside the try/finally so that a failure here is
        # reported as is, instead of being masked by an unbound name in the
        # cleanup.
        self.args.hmmsearch = 'foo'
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")

            shutil.copyfile(
                self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
                prot_file)

            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir,
                                         cfg)
            # raw string: '\[' is a regex escape, not a Python string escape
            self.assertTrue(
                re.search(
                    r"failed : \[Errno 2\] No such file or directory: 'foo'",
                    str(ctx.exception)))
        finally:
            # the patch is class-wide: undo it whatever happened
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_no_protfile_short_seq(self):
        """
        Run ``integrase.find_integrase`` with ``gembase`` False while
        ``len(replicon)`` is forced to 200, and check that the four expected
        result files are created in the temporary directory.

        The ``__len__`` patch on the replicon class is always restored.
        """
        # Setup is done outside the try/finally so that a failure here is
        # reported as is, instead of being masked by an unbound name in the
        # cleanup.
        cfg = Config(self.args)
        self.args.gembase = False
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_name = 'ACBA.007.P01_13'
        prot_path = self.find_data(
            os.path.join('Proteins', prot_name + '.prt'))

        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
            shutil.copyfile(prot_path, prot_file)

            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
            for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res',
                           '_phage_int_table.res'):
                res = os.path.join(self.tmp_dir, replicon.id + suffix)
                self.assertTrue(os.path.exists(res))
        finally:
            # the patch is class-wide: undo it whatever happened
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_no_protfile(self):
        """
        ``integrase.find_integrase`` must raise ``EmptyFileError`` when it is
        given the empty file 'foo.prt' (``gembase`` is False and
        ``len(replicon)`` is forced to 500000).

        The ``__len__`` patch on the replicon class is always restored.
        """
        # Setup is done outside the try/finally so that a failure here is
        # reported as is, instead of being masked by an unbound name in the
        # cleanup.
        cfg = Config(self.args)
        self.args.gembase = False
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            prot_file = os.path.join(self.tmp_dir, "foo.prt")
            # create an empty protein file
            with open(prot_file, 'w'):
                pass
            with self.catch_log():
                with self.assertRaises(EmptyFileError):
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
        finally:
            # the patch is class-wide: undo it whatever happened
            replicon_cls.__len__ = len_ori
    def test_find_attc_max_In0(self):
        """
        Call ``find_attc_max`` (``circular=True``) on a single integron that
        only carries an integrase, and expect an empty frame with the
        ``max_cols`` columns and ``max_dtype`` dtypes.
        """
        replicon_name = 'ESCO001.B.00018.P002'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))

        circular_topology = Topology('circ')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = circular_topology
            replicon = next(sequences_db)

        integron = Integron(replicon, self.cfg)

        integrase_data = {'pos_beg': [90229],
                          'pos_end': [91242],
                          'strand': -1,
                          'evalue': 1.400000e-24,
                          'type_elt': 'protein',
                          'annotation': 'intI',
                          'model': 'intersection_tyr_intI',
                          'distance_2attC': np.nan
                          }
        integrase_df = pd.DataFrame(integrase_data,
                                    index=['ESCO001.B.00018.P002_106'],
                                    columns=self.columns)
        integron.integrase = integrase_df.astype(dtype=self.dtype)

        cfg = self.cfg
        max_final = find_attc_max([integron], replicon,
                                  cfg.distance_threshold, cfg.model_attc_path,
                                  cfg.max_attc_size, cfg.min_attc_size,
                                  circular=True,
                                  out_dir=self.tmp_dir)

        expected = pd.DataFrame(columns=self.max_cols).astype(dtype=self.max_dtype)
        pdt.assert_frame_equal(max_final, expected)
    def test_add_attc(self):
        """
        Add two attC sites to an integron with ``Integron.add_attC`` and check
        ``integron.attC`` after each call.

        The second site is the first one shifted by 100 positions; its
        ``distance_2attC`` is expected to be
        ``(pos_beg_2 - pos_end_1) % len(replicon)``.
        """
        replicon_name = "acba.007.p01.13"
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        data_attc_1 = {
            "pos_beg": 10,
            "pos_end": 100,
            "strand": -1,
            "evalue": 1.1e-07,
            "type_elt": "attC",
            "annotation": "attC",
            "model": "attc_4",
            "distance_2attC": np.nan
        }

        def add_attc_args(frame, label):
            # values of row *label*, in the positional order passed to add_attC
            return tuple(frame.loc[label, col]
                         for col in ('pos_beg', 'pos_end', 'strand', 'evalue', 'model'))

        attc_1 = pd.DataFrame(data_attc_1,
                              columns=self.columns,
                              index=['attc_001'])
        attc_1 = attc_1.astype(dtype=self.dtype)

        integron = Integron(replicon, self.cfg)
        integron.add_attC(*add_attc_args(attc_1, 'attc_001'))

        pdt.assert_frame_equal(attc_1, integron.attC)

        attc_2 = pd.DataFrame(data_attc_1,
                              columns=self.columns,
                              index=['attc_002'])
        attc_2 = attc_2.astype(dtype=self.dtype)
        attc_2['pos_beg'] = attc_2['pos_beg'] + 100
        attc_2['pos_end'] = attc_2['pos_end'] + 100
        attc_2["distance_2attC"] = (
            attc_2.loc['attc_002', 'pos_beg'] -
            attc_1.loc['attc_001', 'pos_end']) % len(replicon)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and keeps both row labels.
        attc = pd.concat([attc_1, attc_2])

        integron.add_attC(*add_attc_args(attc_2, 'attc_002'))
        pdt.assert_frame_equal(attc, integron.attC)
    def setUp(self):
        """
        Prepare the fixtures: locate the installation root, create an empty
        temporary directory, build the configuration, load the
        'OBAL001.B.00005.C001' replicon into an Integron and define the column
        names / dtypes of the frames compared in the tests.
        """
        self.local_install = 'INTEGRON_HOME' in os.environ
        if self.local_install:
            self.integron_home = os.environ['INTEGRON_HOME']
        else:
            # fall back to the directory two levels above this test file
            two_levels_up = os.path.join(os.path.dirname(__file__), '..', '..')
            self.integron_home = os.path.normpath(os.path.abspath(two_levels_up))

        # start from a fresh temporary directory
        self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
        if os.path.isdir(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.makedirs(self.tmp_dir)

        args = argparse.Namespace()
        args.attc_model = 'attc_4.cm'
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.eagle_eyes = False
        args.local_max = False
        self.cfg = Config(args)
        self.cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'OBAL001.B.00005.C001'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))

        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            self.replicon = next(sequences_db)

        self.integron = Integron(self.replicon, self.cfg)

        # layout of the integron element frames
        self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model', 'distance_2attC', 'annotation']
        self.dtype = {
            "pos_beg": 'int',
            "pos_end": 'int',
            "strand": 'int',
            "evalue": 'float',
            "type_elt": 'str',
            "annotation": 'str',
            "model": 'str',
            "distance_2attC": 'float',
        }

        # layout of the frames expected from find_attc_max
        self.max_cols = ['Accession_number', 'cm_attC', 'cm_debut', 'cm_fin', 'pos_beg', 'pos_end', 'sens', 'evalue']
        self.max_dtype = {
            'Accession_number': 'str',
            'cm_attC': 'str',
            'cm_debut': 'int',
            'cm_fin': 'int',
            'pos_beg': 'int',
            'pos_end': 'int',
            'sens': 'str',
            'evalue': 'float',
        }
    def test_find_integron_calin_threshold(self):
        """
        Run ``find_integron`` twice on the same inputs, with ``calin_threshold``
        set to 2 and then to 3, and expect 2 and then 1 integron(s).
        """
        replicon_name = 'ESCO001.B.00018.P002'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))

        circular_topology = Topology('circ')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = circular_topology
            replicon = next(sequences_db)

        # pre-computed search results shipped with the test data
        results_dir = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id)))
        attc_file = os.path.join(results_dir, '{}_attc_table.res'.format(replicon.id))
        intI_file = os.path.join(results_dir, '{}_intI.res'.format(replicon.id))
        phageI_file = os.path.join(results_dir, '{}_phage_int.res'.format(replicon.id))

        data_prefix = os.path.join(os.path.dirname(__file__), 'data')

        args = argparse.Namespace()
        args.no_proteins = False
        args.keep_palindromes = True
        args.distance_threshold = 4000
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.local_max = False
        args.gembase = False
        args.union_integrases = False

        # first pass: threshold of 2
        args.calin_threshold = 2
        cfg = Config(args)
        cfg._prefix_data = data_prefix
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        with self.catch_log():
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 2)

        # second pass: threshold of 3, same protein db
        args.calin_threshold = 3
        cfg = Config(args)
        cfg._prefix_data = data_prefix

        with self.catch_log():
            integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
        self.assertEqual(len(integrons), 1)
 def test_parse_topology(self):
     """
     ``Topology._parse_topology`` must return 'circ' or 'lin' for the
     accepted spellings and raise RuntimeError for anything else.
     """
     topo = Topology('circ')
     cases = [(spelling, 'circ') for spelling in ('circ', 'circular', 'CIRC', 'CIRCULAR')]
     cases += [(spelling, 'lin') for spelling in ('lin', 'linear', 'LIN', 'LINEAR')]
     for spelling, expected in cases:
         self.assertEqual(topo._parse_topology(spelling), expected)

     with self.assertRaises(RuntimeError) as ctx:
         topo._parse_topology('foo')
     error_msg = str(ctx.exception)
     self.assertEqual(error_msg, "'foo' is not allowed for topology")
Exemple #16
0
    def test_add_proteins(self):
        """
        Give an integron seven attC sites, call ``add_proteins`` with a
        ProdigalDB built on the '.prt.short' protein file, and compare
        ``integron.proteins`` with the four expected protein rows.
        """
        replicon_name = 'pssu.001.c01.13'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        prot_file = os.path.join(self._data_dir, '{}.prt.short'.format(replicon_name))

        args = argparse.Namespace()
        args.gembase = False
        args.annot_parser_name = None
        cfg = Config(args)
        integron = Integron(replicon, cfg)

        data_attc = {"pos_beg": [3072863, 3073496, 3074121, 3075059, 3075593, 3076281, 3076659],
                     "pos_end": [3072931, 3073555, 3074232, 3075118, 3075652, 3076340, 3076718],
                     "strand": [-1] * 7,
                     "evalue": [2.5e-06, 7e-08, 6.5e-08, 3.2e-06, 4.1e-07, 1.4e-08, 4e-08],
                     "type_elt": ['attC'] * 7,
                     "annotation": ['attC'] * 7,
                     "model": ['attc_4'] * 7,
                     "distance_2attC": [np.nan, 565.0, 566.0, 827.0, 475.0, 629.0, 319.0]}

        # one label per attC row, numbered from 0
        attc_count = len(data_attc['pos_beg'])
        attc_labels = ['attc_00{}'.format(i) for i in range(attc_count)]
        attc_df = pd.DataFrame(data_attc, columns=self.columns, index=attc_labels)
        integron.attC = attc_df.astype(dtype=self.dtype)

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        integron.add_proteins(prot_db)

        protein_labels = ['PSSU.001.C01_13_281{}'.format(i) for i in range(5, 9)]
        exp_data = {'pos_beg': [3071974, 3072950, 3074243, 3076720],
                    'pos_end': [3072855, 3073468, 3075055, 3077511],
                    'strand': [-1] * 4,
                    'evalue': [np.nan] * 4,
                    'type_elt': ['protein'] * 4,
                    'annotation': ['protein'] * 4,
                    'model': ['NA'] * 4,
                    'distance_2attC': [np.nan] * 4
                    }
        exp_proteins = pd.DataFrame(exp_data, index=protein_labels, columns=self.columns)
        exp_proteins = exp_proteins.astype(dtype=self.dtype)

        # row order is not part of the check: compare both frames sorted by index
        pdt.assert_frame_equal(exp_proteins.sort_index(), integron.proteins.sort_index())
 def test_getitem(self):
     """
     Index a Topology built from 'topology.txt' by sequence id and check the
     value returned for each id, then for an id not in the expected table.
     """
     topo = Topology('circ', topology_file=self.find_data('topology.txt'))
     per_sequence = (
         ('seq1', 'circ'),
         ('seq2', 'circ'),
         ('seq3', 'lin'),
         ('seq4', 'lin'),
         ('seq5', 'circ'),
         ('seq6', 'circ'),
         ('seq7', 'lin'),
         ('seq8', 'lin'),
     )
     for seqid, topology in per_sequence:
         self.assertEqual(topology, topo[seqid])

     # 'foo' is not one of the ids above; the lookup is expected to give
     # 'circ', the value passed as first argument to the constructor
     self.assertEqual('circ', topo['foo'])
Exemple #18
0
    def test_add_integrase(self):
        """
        Add an integrase with ``Integron.add_integrase`` and check
        ``integron.integrase``; a second call with the same arguments must
        raise RuntimeError.
        """
        replicon_name = "acba.007.p01.13"
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        id_int = "ACBA.007.P01_13_1"
        data_integrase = {"pos_beg": 55,
                          "pos_end": 1014,
                          "strand": 1,
                          "evalue": 1.900000e-25,
                          "type_elt": "protein",
                          "annotation": "intI",
                          "model": "intersection_tyr_intI",
                          "distance_2attC": np.nan}

        expected = pd.DataFrame(data_integrase, columns=self.columns, index=[id_int])
        expected = expected.astype(dtype=self.dtype)

        # positional arguments passed to add_integrase, identical for both calls
        call_args = (data_integrase["pos_beg"],
                     data_integrase["pos_end"],
                     id_int,
                     data_integrase["strand"],
                     data_integrase["evalue"],
                     data_integrase["model"])

        integron = Integron(replicon, self.cfg)
        integron.add_integrase(*call_args)
        pdt.assert_frame_equal(expected, integron.integrase)

        with self.assertRaises(RuntimeError) as ctx:
            integron.add_integrase(*call_args)
        self.assertEqual(str(ctx.exception), "add_integrase should be called once.")
Exemple #19
0
    def setUp(self):
        """
        Build the fixtures shared by all tests: the 'acba.007.p01.13'
        replicon, the path of its protein file in the pre-computed results,
        a ProdigalDB on that file and the distance threshold.
        """
        self.replicon_id = 'ACBA.007.P01_13'
        self.dist_threshold = 4000

        self.replicon_path = self.find_data(os.path.join('Replicons', "acba.007.p01.13.fst"))
        linear = Topology('lin')
        with FastaIterator(self.replicon_path) as sequences_db:
            sequences_db.topologies = linear
            self.seq = next(sequences_db)

        results_tmp_dir = os.path.join("Results_Integron_Finder_acba.007.p01.13",
                                       "tmp_{}".format(self.replicon_id))
        prot_basename = "{}.prt".format(self.replicon_id)
        self.prot_file = self.find_data(os.path.join(results_tmp_dir, prot_basename))

        # configuration built from an empty namespace
        cfg = Config(argparse.Namespace())
        self.prot_db = ProdigalDB(self.seq, cfg, prot_file=self.prot_file)
    def test_find_integrase_gembase_hmmer_error(self):
        """
        With ``cpu`` set to the invalid value 'foo', ``integrase.find_integrase``
        must raise a RuntimeError whose message ends with
        'failed return code = 1'.
        """
        self.args.gembase = True
        self.args.cpu = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        fasta_path = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        # work on a copy of the reference protein file placed in the tmp dir
        prot_basename = replicon.id + ".prt"
        prot_file = os.path.join(self.tmp_dir, prot_basename)
        shutil.copyfile(os.path.join(self._data_dir, 'Proteins', prot_basename), prot_file)

        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        error_msg = str(ctx.exception)
        self.assertTrue(error_msg.endswith('failed return code = 1'))
    def test_find_integrase_gembase_no_hmmer_no_replicon(self):
        """
        ``integrase.find_integrase`` is given a protein file path that this
        test never creates and must raise RuntimeError with the
        'does not exists' message.
        """
        self.args.gembase = True
        self.args.hmmsearch = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        fasta_path = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        # the path is only computed, no file is written there by this test
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")

        with self.catch_log():
            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
            expected_msg = "The protein file: '{}' does not exists cannot perform hmmsearch on it.".format(prot_file)
            self.assertEqual(expected_msg, str(ctx.exception))
    def test_add_promoter(self):
        """
        Check ``Integron.add_promoter`` in three configurations: attC sites
        plus integrase, attC sites only, and integrase only.

        Two promoters are expected whenever the integrase is set, and an
        empty frame when only attC sites are set.
        """
        replicon_name = 'saen.040.p01.10'
        fasta_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        # kept from the original test; not used by the checks below
        prot_file = os.path.join(self._data_dir, 'Proteins', '{}.prt'.format(replicon_name))

        # add_promoter reads the attC sites and the integrase of the integron,
        # so both frames are prepared first
        attc_data = {
            'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
            'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
            'strand': [-1] * 6,
            'evalue': [3.400000e-06, 7.500000e-09, 6.800000e-06, 2.800000e-07, 6.600000e-06, 1.800000e-04],
            'type_elt': ['attC'] * 6,
            'annotation': ['attC'] * 6,
            'model': ['attc_4'] * 6,
            'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0]
        }
        attc_labels = ['attc_00{}'.format(i) for i in range(1, 7)]
        attc_df = pd.DataFrame(attc_data, index=attc_labels, columns=self.columns)
        attc_df = attc_df.astype(dtype=self.dtype)

        integrase_data = {
            'pos_beg': 109469,
            'pos_end': 110482,
            'strand': 1,
            'evalue': 1.600000e-24,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan
        }
        integrase_df = pd.DataFrame(integrase_data, index=['SAEN.040.P01_10_135'], columns=self.columns)
        integrase_df = integrase_df.astype(dtype=self.dtype)

        # --- case 1: attC sites and integrase ---
        integron = Integron(replicon, self.cfg)
        integron.attC = attc_df
        integron.integrase = integrase_df
        integron.add_promoter()

        promoter_data = {
            'pos_beg': [109413, 109439],
            'pos_end': [109447, 109465],
            'strand': [1, -1],
            'evalue': [np.nan] * 2,
            'type_elt': ['Promoter'] * 2,
            'annotation': ['Pint_1', 'Pc_1'],
            'model': ['NA'] * 2,
            'distance_2attC': [np.nan] * 2
        }
        exp_promoters = pd.DataFrame(promoter_data, index=['P_intI1', 'Pc_int1'], columns=self.columns)
        exp_promoters = exp_promoters.astype(dtype=self.dtype)

        pdt.assert_frame_equal(exp_promoters, integron.promoter)

        # --- case 2: attC sites, no integrase ---
        integron = Integron(replicon, self.cfg)
        integron.attC = attc_df
        integron.add_promoter()

        empty_promoter = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)
        pdt.assert_frame_equal(empty_promoter, integron.promoter)

        # --- case 3: integrase, no attC site ---
        integron = Integron(replicon, self.cfg)
        integron.integrase = integrase_df
        integron.add_promoter()

        pdt.assert_frame_equal(exp_promoters, integron.promoter)
    def test_find_integron_attC_is_df(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        attc_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))

        intI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))

        args = argparse.Namespace()
        args.no_proteins = True
        args.keep_palindromes = True
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000
        args.calin_threshold = 2
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)

        attc_file = read_infernal(attc_file,
                                  replicon_name,
                                  len_model_attc,
                                  evalue=cfg.evalue_attc,
                                  size_max_attc=cfg.max_attc_size,
                                  size_min_attc=cfg.min_attc_size)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)

        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame(
            {
                'annotation': ['attC'] * 3,
                'distance_2attC': [np.nan, 1196.0, 469.0],
                'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                'model': ['attc_4'] * 3,
                'pos_beg': [17825, 19080, 19618],
                'pos_end': [17884, 19149, 19726],
                'strand': [-1, -1, -1],
                'type_elt': 'attC'
            },
            columns=self.columns,
            index=['attc_001', 'attc_002', 'attc_003'])
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.integrase, exp)
        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
    def test_attI(self):
        """
        Check Integron.add_attI in three situations:
        attC + integrase, attC only, integrase only.
        """
        replicon_name = 'saen.040.p01.10'
        fasta_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        linear = Topology('lin')
        with FastaIterator(fasta_path) as sequences_db:
            sequences_db.topologies = linear
            replicon = next(sequences_db)

        attc_data = {
            'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
            'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
            'strand': [-1] * 6,
            'evalue': [
                3.400000e-06, 7.500000e-09, 6.800000e-06, 2.800000e-07,
                6.600000e-06, 1.800000e-04
            ],
            'type_elt': ['attC'] * 6,
            'annotation': ['attC'] * 6,
            'model': ['attc_4'] * 6,
            'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0],
        }
        attC = pd.DataFrame(attc_data,
                            index=['attc_{:03d}'.format(i) for i in range(1, 7)],
                            columns=self.columns).astype(dtype=self.dtype)

        integrase_data = {
            'pos_beg': 109469,
            'pos_end': 110482,
            'strand': 1,
            'evalue': 1.600000e-24,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan,
        }
        integrase = pd.DataFrame(integrase_data,
                                 index=['SAEN.040.P01_10_135'],
                                 columns=self.columns).astype(dtype=self.dtype)

        # the attI site expected each time an integrase is available
        exp_attI = pd.DataFrame(
            {
                'pos_beg': [109330],
                'pos_end': [109388],
                'strand': [-1],
                'evalue': [np.nan],
                'type_elt': 'attI',
                'annotation': 'attI_1',
                'model': 'NA',
                'distance_2attC': [np.nan],
            },
            index=['attI1'],
            columns=self.columns).astype(dtype=self.dtype)

        # what is expected when no attI can be reported
        empty_attI = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        ######################################
        # test attI with attC with integrase #
        ######################################
        integron = Integron(replicon, self.cfg)
        integron.attC = attC
        integron.integrase = integrase
        integron.add_attI()
        pdt.assert_frame_equal(exp_attI, integron.attI)

        #########################################
        # test attI with attC without integrase #
        #########################################
        integron = Integron(replicon, self.cfg)
        integron.attC = attC
        integron.add_attI()
        pdt.assert_frame_equal(empty_attI, integron.attI)

        #########################################
        # test attI without attC with integrase #
        #########################################
        integron = Integron(replicon, self.cfg)
        integron.integrase = integrase
        integron.add_attI()
        pdt.assert_frame_equal(exp_attI, integron.attI)
    def test_describe(self):
        """
        Check that Integron.describe() gathers integrase, attC, promoter, attI
        and proteins in a single frame, completed with the columns
        'type', 'ID_replicon', 'ID_integron', 'default' and 'considered_topology'.
        """
        replicon_name = "acba.007.p01.13"
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        args = argparse.Namespace()
        args.eagle_eyes = False
        args.local_max = False
        cfg = Config(args)

        integron = Integron(replicon, cfg)

        def one_row_frame(data, row_id):
            # build a single row, correctly typed, integron component
            frame = pd.DataFrame(data, columns=self.columns, index=[row_id])
            return frame.astype(dtype=self.dtype)

        data_integrase = {
            "pos_beg": 55,
            "pos_end": 1014,
            "strand": 1,
            "evalue": 1.900000e-25,
            "type_elt": "protein",
            "annotation": "intI",
            "model": "intersection_tyr_intI",
            "distance_2attC": np.nan
        }
        id_int = "ACBA.007.P01_13_1"
        integrase = one_row_frame(data_integrase, id_int)

        # the same payload is reused for all the other components,
        # only the row identifier differs from one component to the other
        data_attc = {
            "pos_beg": 10,
            "pos_end": 100,
            "strand": -1,
            "evalue": 1.1e-07,
            "type_elt": "attC",
            "annotation": "attC",
            "model": "attc_4",
            "distance_2attC": np.nan
        }
        attC = one_row_frame(data_attc, 'attc_001')
        promoter = one_row_frame(data_attc, 'prom_001')
        attI = one_row_frame(data_attc, 'attI_001')
        proteins = one_row_frame(data_attc, 'prot_001')

        expected_description = pd.concat(
            [integrase, attC, promoter, attI, proteins], ignore_index=False)
        # the former index (the element identifiers) becomes the first column
        expected_description = expected_description.reset_index()
        expected_description.columns = ["element"] + list(
            expected_description.columns[1:])
        expected_description["type"] = "complete"
        expected_description["ID_replicon"] = replicon.id
        expected_description["ID_integron"] = id(
            integron)  # uniq identifier of a given Integron
        expected_description["default"] = "Yes"
        expected_description["considered_topology"] = replicon.topology
        expected_description.drop_duplicates(subset=["element"], inplace=True)

        # the original set this flag twice in a row; once is enough
        self.cfg._args.eagle_eyes = False
        integron.integrase = integrase
        integron.attC = attC
        integron.promoter = promoter
        integron.attI = attI
        integron.proteins = proteins

        received_description = integron.describe()
        pdt.assert_frame_equal(received_description, expected_description)
    def test_integrons_report(self):
        """
        Check that results.integrons_report, fed with a hand-built integron,
        produces the same frame as the reference '.integrons' file stored in the test data.
        """
        replicon_name = "acba.007.p01.13"
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        args = argparse.Namespace()
        cfg = Config(args)
        # the options are set after the Config creation, as the original test did
        # (the redundant second assignment of eagle_eyes has been dropped)
        cfg._args.eagle_eyes = False
        cfg._args.local_max = False

        integron = Integron(replicon, cfg)
        columns = [
            'pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model',
            'distance_2attC', 'annotation'
        ]
        dtype = {
            "pos_beg": 'int',
            "pos_end": 'int',
            "strand": 'int',
            "evalue": 'float',
            "type_elt": 'str',
            "annotation": 'str',
            "model": 'str',
            "distance_2attC": 'float'
        }

        data_integrase = {
            "pos_beg": 55,
            "pos_end": 1014,
            "strand": 1,
            "evalue": 1.900000e-25,
            "type_elt": "protein",
            "annotation": "intI",
            "model": "intersection_tyr_intI",
            "distance_2attC": np.nan
        }
        id_int = "ACBA.007.P01_13_1"
        integrase = pd.DataFrame(data_integrase,
                                 columns=columns,
                                 index=[id_int])
        integrase = integrase.astype(dtype=dtype)

        data_attc = {
            "pos_beg": [17825, 19080, 19618],
            "pos_end": [17884, 19149, 19726],
            "strand": [-1] * 3,
            "evalue": [1.000000e-09, 1.000000e-04, 1.100000e-07],
            "type_elt": ["attC"] * 3,
            "annotation": ["attC"] * 3,
            "model": ["attc_4"] * 3,
            "distance_2attC": [np.nan, 1196.0, 469.0]
        }
        attC = pd.DataFrame(data_attc,
                            columns=columns,
                            index=['attc_00{}'.format(i) for i in range(1, 4)])
        attC = attC.astype(dtype=dtype)

        promoter = pd.DataFrame(
            {
                'pos_beg': 25,
                'pos_end': 51,
                'strand': -1,
                'evalue': np.nan,
                'type_elt': 'Promoter',
                'annotation': 'Pc_1',
                'model': np.nan,
                'distance_2attC': np.nan
            },
            index=['Pc_int1'],
            columns=columns)
        promoter = promoter.astype(dtype=dtype)

        proteins = pd.DataFrame(
            {
                'pos_beg': [17375, 17886, 19090, 19721],
                'pos_end': [17722, 18665, 19749, 20254],
                'strand': [-1] * 4,
                'evalue': [np.nan] * 4,
                'type_elt': ['protein'] * 4,
                'annotation': ['protein'] * 4,
                'model': [np.nan] * 4,
                'distance_2attC': [np.nan] * 4
            },
            index=['ACBA.007.P01_13_2{}'.format(i) for i in range(4)],
            columns=columns)
        proteins = proteins.astype(dtype=dtype)

        integron.integrase = integrase
        integron.attC = attC
        integron.promoter = promoter
        integron.proteins = proteins
        report = results.integrons_report([integron])

        # the reference report is a tab separated file
        exp_report_path = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         '{}.integrons'.format(replicon_name)))
        exp_report = pd.read_csv(exp_report_path, sep="\t")
        exp_report = exp_report.astype(dtype=dtype)
        pdt.assert_frame_equal(exp_report, report)
    def test_find_integron_proteins_circ_replicon(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.circular'
        attc_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False

        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.union_integrases = False
        args.keep_palindromes = True
        args.calin_threshold = 2
        args.local_max = False

        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 1 complete integron(s) found with a total 3 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame(
            {
                'annotation': 'intI',
                'distance_2attC': np.nan,
                'evalue': 1.900000e-25,
                'model': 'intersection_tyr_intI',
                'pos_beg': 55,
                'pos_end': 1014,
                'strand': 1,
                'type_elt': 'protein'
            },
            columns=self.columns,
            index=['ACBA.007.P01_13_1'])
        exp = exp.astype(dtype=self.dtype)
        pdt.assert_frame_equal(integron.integrase, exp)

        exp = pd.DataFrame(
            {
                'annotation': ['attC'] * 3,
                'distance_2attC': [np.nan, 1196.0, 469.0],
                'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                'model': ['attc_4'] * 3,
                'pos_beg': [17825, 19080, 19618],
                'pos_end': [17884, 19149, 19726],
                'strand': [-1, -1, -1],
                'type_elt': 'attC'
            },
            columns=self.columns,
            index=['attc_001', 'attc_002', 'attc_003'])
        exp = exp.astype(dtype=self.dtype)
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
    def test_find_integron_proteins_n_union_integrase(self):
        replicon_name = 'OBAL001.B.00005.C001'
        replicon_id = 'OBAL001.B.00005.C001'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_name + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
        attc_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))
        intI_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join(result_dir, 'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))
        args = argparse.Namespace()
        args.evalue_attc = 1.
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
        args.calin_threshold = 2
        args.attc_model = 'attc_4.cm'
        args.no_proteins = False
        args.keep_palindromes = True
        args.union_integrases = True
        args.gembase = False  # needed by read_hmm which is called when no_proteins == False
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
        exp_msg = """In replicon {}, there are:
- 3 complete integron(s) found with a total 4 attC site(s)
- 0 CALIN element(s) found with a total of 0 attC site(s)
- 2 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)
        self.assertEqual(len(integrons), 5)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

        exp_int = []
        exp_int.append(
            pd.DataFrame([[
                418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_388'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                434671, 440118, -1, 0.085, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_399'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase',
                np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_472'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                1940269, 1941171, 1, 4.200000e-43, 'protein',
                'Phage_integrase', np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_1793'
                                ]).astype(dtype=self.dtype))
        exp_int.append(
            pd.DataFrame([[
                1545830, 1546807, -1, 1.100000e-21, 'protein',
                'intersection_tyr_intI', np.nan, 'intI'
            ]],
                         columns=self.columns,
                         index=['OBAL001.B.00005.C001_1416'
                                ]).astype(dtype=self.dtype))

        exp_attC = []
        exp_attC.append(
            pd.DataFrame(
                [[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
                columns=self.columns,
                index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(
            pd.DataFrame([[
                442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan,
                'attC'
            ]],
                         columns=self.columns,
                         index=['attc_001']).astype(dtype=self.dtype))
        exp_attC.append(empty)
        exp_attC.append(empty)
        exp_attC.append(
            pd.DataFrame([[
                1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'
            ], [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']
                          ],
                         columns=self.columns,
                         index=['attc_001',
                                'attc_002']).astype(dtype=self.dtype))

        for i, integron in enumerate(integrons):
            self.assertEqual(integron.replicon.name, replicon_id)
            pdt.assert_frame_equal(integron.integrase, exp_int[i])
            pdt.assert_frame_equal(integron.attC, exp_attC[i])
            pdt.assert_frame_equal(integron.promoter, empty)
            pdt.assert_frame_equal(integron.attI, empty)
            pdt.assert_frame_equal(integron.proteins, empty)
    def test_FastaIterator(self):
        """
        Check utils.FastaIterator: ids, names and number of sequences, forced replicon name,
        topology (default, read from a topology file, circular requested on 'acba_short'),
        and the handling of sequences with ambiguous characters, invalid characters
        or a too short length.
        """
        file_name = 'multi_fasta'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        topologies = Topology('lin')
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_id = sorted(seq.id for seq in seq_db)

        expected_seq_id = sorted(
            ['ACBA.007.P01_13', 'LIAN.001.C02_10', 'PSSU.001.C01_13'])
        self.assertListEqual(expected_seq_id, received_seq_id)
        self.assertEqual(len(seq_db), 3)

        # without replicon_name, the name of each sequence is its id
        expected_seq_name = expected_seq_id
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_name = sorted(seq.name for seq in seq_db)
        self.assertListEqual(expected_seq_name, received_seq_name)

        # with replicon_name, all the sequences get this name
        replicon_name = 'foo'
        with utils.FastaIterator(replicon_path,
                                 replicon_name=replicon_name) as seq_db:
            seq_db.topologies = topologies
            received_seq_name = {seq.name for seq in seq_db}
        expected_seq_name = {replicon_name}
        self.assertSetEqual(expected_seq_name, received_seq_name)

        # no topologies set on the iterator
        with utils.FastaIterator(replicon_path) as seq_db:
            received_seq_top = [seq.topology for seq in seq_db]
        expected_seq_top = ['lin', 'lin', 'lin']
        self.assertListEqual(expected_seq_top, received_seq_top)

        # topologies read from a topology file
        topologies_data = {
            'ACBA.007.P01_13': 'lin',
            'LIAN.001.C02_10': 'circ',
            'PSSU.001.C01_13': 'lin',
        }
        with tempfile.NamedTemporaryFile(mode='w') as topology_file:
            for rep, topo in topologies_data.items():
                topology_file.write("{} {}\n".format(rep, topo))
            # the file is read through its name, so the content must be on disk
            topology_file.flush()
            topologies = Topology('lin', topology_file=topology_file.name)
            with utils.FastaIterator(replicon_path) as seq_db:
                seq_db.topologies = topologies
                received_seq_top = {seq.id: seq.topology for seq in seq_db}
            self.assertDictEqual(topologies_data, received_seq_top)

        # 'circ' is requested but 'lin' is expected for this replicon
        file_name = 'acba_short'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        topologies = Topology('circ')
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_top = [seq.topology for seq in seq_db]
        expected_seq_top = ['lin']
        self.assertListEqual(expected_seq_top, received_seq_top)

        file_name = 'replicon_ambiguous_char'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        with utils.FastaIterator(replicon_path) as seq_db:
            received_seq_id = sorted(seq.id for seq in seq_db if seq)
        expected_seq_id = sorted(['seq_1', 'seq_2', 'seq_3', 'seq_4'])
        self.assertListEqual(expected_seq_id, received_seq_id)

        file_name = 'replicon_bad_char'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        # 2 sequences are rejected so 2 messages are produced (for seq 3 and seq 4)
        warning_line = r"sequence seq_(3|4) contains invalid characters, the sequence is skipped."
        expected_warning = "\n".join([warning_line] * 2)
        with utils.FastaIterator(replicon_path) as seq_db:
            with self.catch_log() as log:
                received_seq_id = sorted(seq.id for seq in seq_db if seq)
                got_warning = log.get_value().strip()
        self.assertRegex(got_warning, expected_warning)
        expected_seq_id = sorted(['seq_1', 'seq_2'])
        self.assertListEqual(expected_seq_id, received_seq_id)

        file_name = 'replicon_too_short'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        # 2 sequences are rejected so 2 messages are produced (for seq 2 & 4)
        # raw string: '\(' is a regex escape, not a valid Python string escape
        warning_line = r"sequence seq_(4|2) is too short \(32 bp\), the sequence is skipped \(must be > 50bp\)."
        expected_warning = "\n".join([warning_line] * 2)
        with utils.FastaIterator(replicon_path) as seq_db:
            with self.catch_log() as log:
                received_seq_id = sorted(seq.id for seq in seq_db if seq)
                got_warning = log.get_value().strip()

        self.assertRegex(got_warning, expected_warning)
        expected_seq_id = sorted(['seq_1', 'seq_3'])
        self.assertListEqual(expected_seq_id, received_seq_id)
Exemple #30
0
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    The function goes through the following steps:

    1. parse the command line into a config object,
    2. create the output directory and the result directory if needed,
    3. initialise the loggers (the log file is written in the result directory),
    4. check that the paths to cmsearch, hmmsearch and prodigal are set,
    5. log the header,
    6. search integrons in each replicon of the input sequence file,
    7. unless ``config.split_results`` is set, merge the per-replicon result
       files into one ``.integrons`` and one ``.summary`` file, then remove
       the per-replicon files.

    :param args: the arguments passed on the command line.
                 If None, ``sys.argv[1:]`` is used.
    :param loglevel: the output verbosity. If not set, ``config.log_level`` is used.
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raises IsADirectoryError: if outdir or result dir already exists and is not a directory
    :raises PermissionError: if result dir already exists and is not writable
    :raises RuntimeError: if cmsearch, hmmsearch or prodigal is None in the config
    """
    global _log

    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################

    # The directories must be created before the loggers are initialised,
    # as the log is written in integron_finder.out in the result dir.
    # For the same reason the errors below cannot be logged, they are only raised.

    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    elif not os.path.isdir(config.outdir):
        # NOTE(review): NotADirectoryError would describe this case better,
        # the exception type is kept because callers may already catch it.
        raise IsADirectoryError(
            "outdir '{}' already exists and is not a directory".format(config.outdir))

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    elif not os.path.isdir(config.result_dir):
        # report the path which is really at fault: result_dir, not outdir
        raise IsADirectoryError(
            "result dir '{}' already exists and is not a directory".format(config.result_dir))
    elif not os.access(config.result_dir, os.W_OK):
        raise PermissionError(
            "result dir '{}' already exists and is not writable".format(config.result_dir))

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file, out=not config.mute)

    _log = colorlog.getLogger('integron_finder')

    if not loglevel:
        # logs are specify from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    # (name of the config attribute / binary, name of the package which provides it)
    required_tools = (('cmsearch', 'infernal'),
                      ('hmmsearch', 'hmmer'),
                      ('prodigal', 'prodigal'))
    for tool, package in required_tools:
        if getattr(config, tool) is None:
            msg = """cannot find '{0}' in PATH.
Please install {1} package or setup '{0}' binary path with --{0} option""".format(tool, package)
            _log.critical(msg)
            raise RuntimeError(msg)

    ################
    # print Header #
    ################
    # The header uses a dedicated logger with a bare "%(message)s" format
    # and no propagation to the parent loggers.
    # NOTE(review): handlers are added on each call of main and never removed,
    # check if it matters when main is called several times in the same process.
    log_header = colorlog.getLogger('integron_finder.header')
    std_logging = colorlog.logging.logging
    handlers = [std_logging.FileHandler(log_file)]
    if not config.mute:
        handlers.append(colorlog.StreamHandler(sys.stdout))
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(std_logging.INFO)
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(
            config.input_seq_path,
            dist_threshold=config.distance_threshold) as sequences_db:
        # the number of sequences is needed for the default topology
        # and for the progression messages: compute it only once
        sequences_db_len = len(sequences_db)

        ################
        # set topology #
        ################
        # without explicit option, a single sequence is considered as circular
        # and several sequences as linear
        default_topology = 'circ' if sequences_db_len == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology,
                              topology_file=config.topology_file)

        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is None:
                _log.warning(
                    "############ Skipping replicon {}/{} ############".format(
                        rep_no, sequences_db_len))
                continue

            _log.info(
                "############ Processing replicon {} ({}/{}) ############\n"
                .format(replicon.id, rep_no, sequences_db_len))
            integron_res, summary = find_integron_in_one_replicon(
                replicon, config)
            if integron_res:
                all_integrons.append(integron_res)
            if summary:
                all_summaries.append(summary)

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(
            config.result_dir, utils.get_name_from_path(config.input_seq_path))
        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file,
                                 sep="\t",
                                 index=False,
                                 na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
        merged_summary_file = outfile_base_name + ".summary"
        # NOTE(review): the summary is written only when some integrons were found
        # (the test is on agg_integrons, not on agg_summary), confirm this is intended.
        if not agg_integrons.empty:
            agg_summary.to_csv(merged_summary_file,
                               sep="\t",
                               index=False,
                               na_rep="NA",
                               columns=[
                                   'ID_replicon', 'ID_integron', 'complete',
                                   'In0', 'CALIN'
                               ])

        # remove the per-replicon result files now that they are merged
        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name that a replicon result file
                os.unlink(_file)