def test_parse_topology(self):
    """Check ``Topology._parse_topology`` on every accepted spelling.

    Each spelling of circular/linear (lower and upper case, short and long
    form) must map to ``'circ'`` or ``'lin'`` respectively, and an unknown
    value must raise ``RuntimeError`` with an explicit message.
    """
    topo = Topology('circ')
    accepted = (
        ('circ', ('circ', 'circular', 'CIRC', 'CIRCULAR')),
        ('lin', ('lin', 'linear', 'LIN', 'LINEAR')),
    )
    for canonical, spellings in accepted:
        for spelling in spellings:
            self.assertEqual(topo._parse_topology(spelling), canonical)

    with self.assertRaises(RuntimeError) as ctx:
        topo._parse_topology('foo')
    self.assertEqual(str(ctx.exception), "'foo' is not allowed for topology")
def test_parse(self):
    """Parse the ``topology.txt`` fixture and compare the per-sequence mapping
    stored in ``Topology._topology`` with the expected one."""
    topo = Topology('circ')
    topo._parse(self.find_data('topology.txt'))

    # seq1..seq8, in order
    per_seq = ('circ', 'circ', 'lin', 'lin', 'circ', 'circ', 'lin', 'lin')
    expected = {
        'seq{}'.format(rank): topology
        for rank, topology in enumerate(per_seq, start=1)
    }
    self.assertDictEqual(expected, topo._topology)
def test_find_integrase_gembase(self):
    """Call ``integrase.find_integrase`` with ``gembase`` enabled.

    The protein file of replicon acba.007.p01.13 is copied into the temporary
    directory beforehand; afterwards the four ``*.res`` files must exist there.
    """
    cfg = Config(self.args)
    self.args.gembase = True
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    prot_file = os.path.join(self.tmp_dir, replicon_name + ".prt")
    prot_fixture = self.find_data(os.path.join('Proteins', replicon.id + ".prt"))
    shutil.copyfile(prot_fixture, prot_file)

    integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

    suffixes = ('_intI.res', '_intI_table.res', '_phage_int.res', '_phage_int_table.res')
    result_files = [os.path.join(self.tmp_dir, replicon.id + suffix) for suffix in suffixes]
    for result_file in result_files:
        self.assertTrue(os.path.exists(result_file))
def test_expand_linear_left(self):
    """Compare ``infernal.expand`` against a stored result.

    The search is done to the left only (``search_left=True``,
    ``search_right=False``) with ``circular`` set to False, on replicon
    lian.001.c02.10. Inputs and expected output are read from CSV fixtures.
    """
    replicon_name = 'lian.001.c02.10'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    def load_csv(basename):
        # all the fixtures of this test live directly in the data directory
        return pd.read_csv(os.path.join(self._data_dir, basename))

    max_elt_input = load_csv('max_elt_input_1.csv')
    df_max_input = load_csv('df_max_input_1.csv')
    expected = load_csv('max_elt_output_lian_left.csv')

    received = infernal.expand(
        replicon,
        934689, 943099,
        max_elt_input, df_max_input,
        False,  # circular
        4000,   # dist_threshold
        self.model_attc_path,
        200,    # max_attc_size
        40,     # min_attc_size
        search_left=True,
        search_right=False,
    )
    pdt.assert_frame_equal(expected, received)
def setUp(self):
    """Prepare the fixtures shared by the tests of this case.

    Resolves the integron_finder home (``INTEGRON_HOME`` if set, otherwise two
    directories above this file), recreates an empty temporary directory,
    loads the lian.001.c02.10 replicon and replaces ``infernal.read_infernal``
    by a mock bound to the temporary directory.
    """
    self.local_install = 'INTEGRON_HOME' in os.environ
    if self.local_install:
        self.integron_home = os.environ['INTEGRON_HOME']
    else:
        two_levels_up = os.path.join(os.path.dirname(__file__), '..', '..')
        self.integron_home = os.path.normpath(os.path.abspath(two_levels_up))

    # always start from a fresh, empty working directory
    self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
    if os.path.isdir(self.tmp_dir):
        shutil.rmtree(self.tmp_dir)
    os.makedirs(self.tmp_dir)

    self.cmsearch = which('cmsearch')
    self.out_dir = self.tmp_dir
    self.model_attc_path = self.find_data(os.path.join('Models', 'attc_4.cm'))
    self.cpu_nb = 1

    replicon_name = 'lian.001.c02.10'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        self.replicon = next(sequences_db)

    self.evalue_attc = 1.
    self.max_attc_size = 200
    self.min_attc_size = 40
    self.length_cm = 47  # length in 'CLEN' (value for model attc_4.cm)
    self.call = call_wrapper()
    infernal.read_infernal = read_infernal_mock(self.tmp_dir)
def test_find_integrase_no_gembase_with_protfile_empty(self):
    """``find_integrase`` must raise ``EmptyFileError`` on an empty protein file.

    The replicon class ``__len__`` is temporarily replaced so that every
    replicon reports a length of 200, and an empty ``<replicon.id>.prt`` file
    is created in the temporary directory before the call.

    The ``try``/``finally`` starts only once the original ``__len__`` has been
    saved: if the set-up fails earlier, that failure is reported as is instead
    of being masked by a ``NameError`` raised in the ``finally`` clause.
    """
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 200
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        # create the protein file but leave it empty
        with open(prot_file, 'w'):
            pass

        with self.assertRaises(EmptyFileError) as ctx:
            with self.catch_log():
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        self.assertTrue(
            re.match("^The protein file: '.*' is empty cannot perform hmmsearch on it.$",
                     str(ctx.exception)))
    finally:
        # the patch is done on the class, so it must be undone for the other tests
        replicon_cls.__len__ = len_ori
def test_find_integrase_no_gembase_no_protfile_no_prodigal(self):
    """``find_integrase`` must raise ``RuntimeError`` when the hmmsearch binary is missing.

    ``args.hmmsearch`` is set to the non-existing command ``'foo'`` and the
    replicon class ``__len__`` is temporarily replaced so that every replicon
    reports a length of 500000. The error message must mention the
    ``[Errno 2]`` "No such file or directory" failure for ``'foo'``.

    The ``try``/``finally`` starts only once the original ``__len__`` has been
    saved, so that a failure during the set-up is not masked by a
    ``NameError`` raised in the ``finally`` clause.
    """
    self.args.hmmsearch = 'foo'
    self.args.gembase = False
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 500000
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
                        prot_file)

        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        # raw string: '\[' is a regex escape, not a valid Python string escape
        self.assertTrue(
            re.search(r"failed : \[Errno 2\] No such file or directory: 'foo'",
                      str(ctx.exception)))
    finally:
        # the patch is done on the class, so it must be undone for the other tests
        replicon_cls.__len__ = len_ori
def test_find_integrase_no_gembase_no_protfile_short_seq(self):
    """Run ``find_integrase`` without gembase while replicons report a length of 200.

    The replicon class ``__len__`` is temporarily replaced by a function
    returning 200, the ACBA.007.P01_13 protein fixture is copied to the
    temporary directory, and the four ``*.res`` files must exist afterwards.

    The ``try``/``finally`` starts only once the original ``__len__`` has been
    saved, so that a failure during the set-up is not masked by a
    ``NameError`` raised in the ``finally`` clause.
    """
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_name = 'ACBA.007.P01_13'
    prot_path = self.find_data(os.path.join('Proteins', prot_name + '.prt'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 200
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(prot_path, prot_file)

        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res', '_phage_int_table.res'):
            res = os.path.join(self.tmp_dir, replicon.id + suffix)
            self.assertTrue(os.path.exists(res))
    finally:
        # the patch is done on the class, so it must be undone for the other tests
        replicon_cls.__len__ = len_ori
def test_find_integrase_no_gembase_no_protfile(self):
    """``find_integrase`` must raise ``EmptyFileError`` on the empty ``foo.prt`` file.

    The replicon class ``__len__`` is temporarily replaced so that every
    replicon reports a length of 500000, and an empty ``foo.prt`` is created
    in the temporary directory and passed as the protein file.

    The ``try``/``finally`` starts only once the original ``__len__`` has been
    saved, so that a failure during the set-up is not masked by a
    ``NameError`` raised in the ``finally`` clause.
    """
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 500000
    try:
        prot_file = os.path.join(self.tmp_dir, "foo.prt")
        # create the protein file but leave it empty
        with open(prot_file, 'w'):
            pass

        with self.catch_log():
            # only the exception type is checked, hence no ``as ctx``
            with self.assertRaises(EmptyFileError):
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
    finally:
        # the patch is done on the class, so it must be undone for the other tests
        replicon_cls.__len__ = len_ori
def test_find_attc_max_In0(self):
    """``find_attc_max`` on an integron holding only an integrase.

    The integron built on ESCO001.B.00018.P002 (circular) gets an integrase
    and no attC site; the result must equal an empty frame with the
    ``max_cols`` columns cast to ``max_dtype``.
    """
    replicon_name = 'ESCO001.B.00018.P002'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    circular_topology = Topology('circ')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = circular_topology
        replicon = next(sequences_db)

    integrase_data = {
        'pos_beg': [90229],
        'pos_end': [91242],
        'strand': -1,
        'evalue': 1.400000e-24,
        'type_elt': 'protein',
        'annotation': 'intI',
        'model': 'intersection_tyr_intI',
        'distance_2attC': np.nan,
    }
    integrase_df = pd.DataFrame(integrase_data,
                                index=['ESCO001.B.00018.P002_106'],
                                columns=self.columns)
    integrase_df = integrase_df.astype(dtype=self.dtype)

    integron = Integron(replicon, self.cfg)
    integron.integrase = integrase_df

    max_final = find_attc_max([integron], replicon,
                              self.cfg.distance_threshold,
                              self.cfg.model_attc_path,
                              self.cfg.max_attc_size,
                              self.cfg.min_attc_size,
                              circular=True,
                              out_dir=self.tmp_dir)

    expected = pd.DataFrame(columns=self.max_cols).astype(dtype=self.max_dtype)
    pdt.assert_frame_equal(max_final, expected)
def test_add_attc(self):
    """Check ``Integron.add_attC`` for a first and then a second attC site.

    After the first call ``integron.attC`` must hold the single site with a
    NaN ``distance_2attC``. After the second call it must hold both sites, the
    second one carrying the distance between its ``pos_beg`` and the
    ``pos_end`` of the first one, modulo the replicon length.

    The expected two-row frame is built with ``pd.concat``;
    ``DataFrame.append`` was removed in pandas 2.0.
    """
    replicon_name = "acba.007.p01.13"
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    data_attc_1 = {"pos_beg": 10,
                   "pos_end": 100,
                   "strand": -1,
                   "evalue": 1.1e-07,
                   "type_elt": "attC",
                   "annotation": "attC",
                   "model": "attc_4",
                   "distance_2attC": np.nan}

    # first attC site
    attc_1 = pd.DataFrame(data_attc_1, columns=self.columns, index=['attc_001'])
    attc_1 = attc_1.astype(dtype=self.dtype)

    integron = Integron(replicon, self.cfg)
    integron.add_attC(attc_1.loc['attc_001', 'pos_beg'],
                      attc_1.loc['attc_001', 'pos_end'],
                      attc_1.loc['attc_001', 'strand'],
                      attc_1.loc['attc_001', 'evalue'],
                      attc_1.loc['attc_001', 'model'])
    pdt.assert_frame_equal(attc_1, integron.attC)

    # second attC site: same data, shifted by 100 positions
    attc_2 = pd.DataFrame(data_attc_1, columns=self.columns, index=['attc_002'])
    attc_2 = attc_2.astype(dtype=self.dtype)
    attc_2['pos_beg'] = attc_2['pos_beg'] + 100
    attc_2['pos_end'] = attc_2['pos_end'] + 100
    attc_2["distance_2attC"] = (attc_2.loc['attc_002', 'pos_beg'] -
                                attc_1.loc['attc_001', 'pos_end']) % len(replicon)
    # keep the original row labels (no ignore_index), as DataFrame.append did
    attc = pd.concat([attc_1, attc_2])

    integron.add_attC(attc_2.loc['attc_002', 'pos_beg'],
                      attc_2.loc['attc_002', 'pos_end'],
                      attc_2.loc['attc_002', 'strand'],
                      attc_2.loc['attc_002', 'evalue'],
                      attc_2.loc['attc_002', 'model'])
    pdt.assert_frame_equal(attc, integron.attC)
def setUp(self):
    """Prepare the fixtures shared by the Integron tests.

    Resolves the integron_finder home, recreates an empty temporary
    directory, builds a ``Config`` pointing at the test data, loads the
    OBAL001.B.00005.C001 replicon (linear), wraps it in an ``Integron`` and
    defines the column names / dtypes the tests use to build their frames.
    """
    self.local_install = 'INTEGRON_HOME' in os.environ
    if self.local_install:
        self.integron_home = os.environ['INTEGRON_HOME']
    else:
        two_levels_up = os.path.join(os.path.dirname(__file__), '..', '..')
        self.integron_home = os.path.normpath(os.path.abspath(two_levels_up))

    # always start from a fresh, empty working directory
    self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
    if os.path.isdir(self.tmp_dir):
        shutil.rmtree(self.tmp_dir)
    os.makedirs(self.tmp_dir)

    args = argparse.Namespace(
        attc_model='attc_4.cm',
        max_attc_size=200,
        min_attc_size=40,
        distance_threshold=4000,  # (4kb at least between 2 different arrays)
        eagle_eyes=False,
        local_max=False,
    )
    self.cfg = Config(args)
    self.cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'OBAL001.B.00005.C001'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        self.replicon = next(sequences_db)
    self.integron = Integron(self.replicon, self.cfg)

    self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt',
                    'model', 'distance_2attC', 'annotation']
    self.dtype = {
        "pos_beg": 'int',
        "pos_end": 'int',
        "strand": 'int',
        "evalue": 'float',
        "type_elt": 'str',
        "annotation": 'str',
        "model": 'str',
        "distance_2attC": 'float',
    }
    self.max_dtype = {
        'Accession_number': 'str',
        'cm_attC': 'str',
        'cm_debut': 'int',
        'cm_fin': 'int',
        'pos_beg': 'int',
        'pos_end': 'int',
        'sens': 'str',
        'evalue': 'float',
    }
    self.max_cols = ['Accession_number', 'cm_attC', 'cm_debut', 'cm_fin',
                     'pos_beg', 'pos_end', 'sens', 'evalue']
def test_find_integron_calin_threshold(self):
    """Check the effect of ``calin_threshold`` on ``find_integron``.

    On ESCO001.B.00018.P002 (circular), 2 integrons are expected with a
    threshold of 2 and only 1 with a threshold of 3.
    """
    replicon_name = 'ESCO001.B.00018.P002'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
    circular_topology = Topology('circ')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = circular_topology
        replicon = next(sequences_db)

    results_dir = self.find_data(
        os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                     'tmp_{}'.format(replicon.id)))
    attc_file = os.path.join(results_dir, '{}_attc_table.res'.format(replicon.id))
    intI_file = os.path.join(results_dir, '{}_intI.res'.format(replicon.id))
    phageI_file = os.path.join(results_dir, '{}_phage_int.res'.format(replicon.id))

    args = argparse.Namespace(
        no_proteins=False,
        keep_palindromes=True,
        distance_threshold=4000,
        attc_model='attc_4.cm',
        evalue_attc=1.0,
        max_attc_size=200,
        min_attc_size=40,
        local_max=False,
        gembase=False,
        union_integrases=False,
        calin_threshold=2,
    )
    data_prefix = os.path.join(os.path.dirname(__file__), 'data')

    # threshold = 2
    cfg = Config(args)
    cfg._prefix_data = data_prefix
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
    with self.catch_log():
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
    self.assertEqual(len(integrons), 2)

    # threshold = 3, same protein db
    args.calin_threshold = 3
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    with self.catch_log():
        integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg)
    self.assertEqual(len(integrons), 1)
def test_add_proteins(self):
    """Check ``Integron.add_proteins`` on pssu.001.c01.13.

    Seven attC sites are set on the integron, proteins are read from the
    ``.prt.short`` fixture through ``ProdigalDB``, and the four proteins
    PSSU.001.C01_13_2815 .. 2818 are expected (compared after sorting on
    the index).
    """
    replicon_name = 'pssu.001.c01.13'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    prot_file = os.path.join(self._data_dir, '{}.prt.short'.format(replicon_name))

    args = argparse.Namespace(gembase=False, annot_parser_name=None)
    cfg = Config(args)
    integron = Integron(replicon, cfg)

    data_attc = {
        "pos_beg": [3072863, 3073496, 3074121, 3075059, 3075593, 3076281, 3076659],
        "pos_end": [3072931, 3073555, 3074232, 3075118, 3075652, 3076340, 3076718],
        "strand": [-1] * 7,
        "evalue": [2.5e-06, 7e-08, 6.5e-08, 3.2e-06, 4.1e-07, 1.4e-08, 4e-08],
        "type_elt": ['attC'] * 7,
        "annotation": ['attC'] * 7,
        "model": ['attc_4'] * 7,
        "distance_2attC": [np.nan, 565.0, 566.0, 827.0, 475.0, 629.0, 319.0],
    }
    # labels start at attc_000 here
    attc_labels = ['attc_00{}'.format(rank) for rank, _ in enumerate(data_attc['pos_beg'])]
    attC = pd.DataFrame(data_attc, columns=self.columns, index=attc_labels)
    integron.attC = attC.astype(dtype=self.dtype)

    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
    integron.add_proteins(prot_db)

    data_proteins = {
        'pos_beg': [3071974, 3072950, 3074243, 3076720],
        'pos_end': [3072855, 3073468, 3075055, 3077511],
        'strand': [-1] * 4,
        'evalue': [np.nan] * 4,
        'type_elt': ['protein'] * 4,
        'annotation': ['protein'] * 4,
        'model': ['NA'] * 4,
        'distance_2attC': [np.nan] * 4,
    }
    protein_labels = ['PSSU.001.C01_13_281{}'.format(i) for i in range(5, 9)]
    exp_proteins = pd.DataFrame(data_proteins, index=protein_labels, columns=self.columns)
    exp_proteins = exp_proteins.astype(dtype=self.dtype)

    pdt.assert_frame_equal(exp_proteins.sort_index(), integron.proteins.sort_index())
def test_getitem(self):
    """Check item access on a ``Topology`` built from ``topology.txt``.

    The sequences listed in the file must give their own topology, and an
    unknown sequence id must give the default one (``'circ'`` here).
    """
    topo = Topology('circ', topology_file=self.find_data('topology.txt'))

    # seq1..seq8, in order
    per_seq = ('circ', 'circ', 'lin', 'lin', 'circ', 'circ', 'lin', 'lin')
    for rank, topology in enumerate(per_seq, start=1):
        seqid = 'seq{}'.format(rank)
        self.assertEqual(topology, topo[seqid])

    self.assertEqual('circ', topo['foo'])
def test_add_integrase(self):
    """Check ``Integron.add_integrase``.

    The first call must fill ``integron.integrase`` with the expected
    one-row frame; a second call must raise ``RuntimeError``.
    """
    replicon_name = "acba.007.p01.13"
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    data_integrase = {
        "pos_beg": 55,
        "pos_end": 1014,
        "strand": 1,
        "evalue": 1.900000e-25,
        "type_elt": "protein",
        "annotation": "intI",
        "model": "intersection_tyr_intI",
        "distance_2attC": np.nan,
    }
    id_int = "ACBA.007.P01_13_1"
    expected = pd.DataFrame(data_integrase, columns=self.columns, index=[id_int])
    expected = expected.astype(dtype=self.dtype)

    # positional arguments, identical for both calls
    call_args = (
        data_integrase["pos_beg"],
        data_integrase["pos_end"],
        id_int,
        data_integrase["strand"],
        data_integrase["evalue"],
        data_integrase["model"],
    )

    integron = Integron(replicon, self.cfg)
    integron.add_integrase(*call_args)
    pdt.assert_frame_equal(expected, integron.integrase)

    with self.assertRaises(RuntimeError) as ctx:
        integron.add_integrase(*call_args)
    self.assertEqual(str(ctx.exception), "add_integrase should be called once.")
def setUp(self):
    """
    Build the objects shared by all the tests: the acba.007.p01.13 replicon
    (linear), its protein file, the matching ``ProdigalDB`` and the distance
    threshold.
    """
    self.replicon_path = self.find_data(os.path.join('Replicons', "acba.007.p01.13.fst"))
    self.replicon_id = 'ACBA.007.P01_13'

    linear = Topology('lin')
    with FastaIterator(self.replicon_path) as sequences_db:
        sequences_db.topologies = linear
        self.seq = next(sequences_db)

    prot_rel_path = os.path.join("Results_Integron_Finder_acba.007.p01.13",
                                 "tmp_{}".format(self.replicon_id),
                                 "{}.prt".format(self.replicon_id))
    self.prot_file = self.find_data(prot_rel_path)

    cfg = Config(argparse.Namespace())
    self.prot_db = ProdigalDB(self.seq, cfg, prot_file=self.prot_file)
    self.dist_threshold = 4000
def test_find_integrase_gembase_hmmer_error(self):
    """``find_integrase`` must raise ``RuntimeError`` when ``args.cpu`` is ``'foo'``.

    The exception message is expected to end with
    ``'failed return code = 1'``.
    """
    self.args.gembase = True
    self.args.cpu = 'foo'
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    fasta = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    prot_basename = replicon.id + ".prt"
    prot_file = os.path.join(self.tmp_dir, prot_basename)
    shutil.copyfile(os.path.join(self._data_dir, 'Proteins', prot_basename), prot_file)

    with self.assertRaises(RuntimeError) as ctx:
        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
    message = str(ctx.exception)
    self.assertTrue(message.endswith('failed return code = 1'))
def test_find_integrase_gembase_no_hmmer_no_replicon(self):
    """``find_integrase`` must raise ``RuntimeError`` when the protein file is missing.

    The protein path points into the temporary directory but the file is
    never created; the exception message must name that path.
    """
    self.args.gembase = True
    self.args.hmmsearch = 'foo'
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    fasta = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    # deliberately not created
    prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")

    with self.catch_log():
        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

    expected_msg = "The protein file: '{}' does not exists cannot perform hmmsearch on it.".format(prot_file)
    self.assertEqual(expected_msg, str(ctx.exception))
def test_add_promoter(self):
    """Check ``Integron.add_promoter`` on saen.040.p01.10 in three set-ups.

    * attC sites and integrase: promoters ``P_intI1`` and ``Pc_int1`` expected
    * attC sites only: empty promoter frame expected
    * integrase only: same two promoters expected
    """
    replicon_name = 'saen.040.p01.10'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    # not used further down, kept as in the original test
    prot_file = os.path.join(self._data_dir, 'Proteins', '{}.prt'.format(replicon_name))

    # add_promoter relies on the attC sites and on the integrase,
    # so both frames have to be built first
    attc_data = {
        'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
        'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
        'strand': [-1] * 6,
        'evalue': [3.400000e-06, 7.500000e-09, 6.800000e-06,
                   2.800000e-07, 6.600000e-06, 1.800000e-04],
        'type_elt': ['attC'] * 6,
        'annotation': ['attC'] * 6,
        'model': ['attc_4'] * 6,
        'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0],
    }
    attc_labels = ['attc_00{}'.format(i) for i in range(1, 7)]
    attC = pd.DataFrame(attc_data, index=attc_labels, columns=self.columns)
    attC = attC.astype(dtype=self.dtype)

    integrase_data = {
        'pos_beg': 109469,
        'pos_end': 110482,
        'strand': 1,
        'evalue': 1.600000e-24,
        'type_elt': 'protein',
        'annotation': 'intI',
        'model': 'intersection_tyr_intI',
        'distance_2attC': np.nan,
    }
    integrase_df = pd.DataFrame(integrase_data,
                                index=['SAEN.040.P01_10_135'],
                                columns=self.columns)
    integrase_df = integrase_df.astype(dtype=self.dtype)

    promoters_data = {
        'pos_beg': [109413, 109439],
        'pos_end': [109447, 109465],
        'strand': [1, -1],
        'evalue': [np.nan] * 2,
        'type_elt': ['Promoter'] * 2,
        'annotation': ['Pint_1', 'Pc_1'],
        'model': ['NA'] * 2,
        'distance_2attC': [np.nan] * 2,
    }
    exp_promoters = pd.DataFrame(promoters_data,
                                 index=['P_intI1', 'Pc_int1'],
                                 columns=self.columns)
    exp_promoters = exp_promoters.astype(dtype=self.dtype)

    # --- promoter with attC and with integrase ---
    integron = Integron(replicon, self.cfg)
    integron.attC = attC
    integron.integrase = integrase_df
    integron.add_promoter()
    pdt.assert_frame_equal(exp_promoters, integron.promoter)

    # --- promoter with attC but without integrase ---
    integron = Integron(replicon, self.cfg)
    integron.attC = attC
    integron.add_promoter()
    empty_promoter = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)
    pdt.assert_frame_equal(empty_promoter, integron.promoter)

    # --- promoter without attC but with integrase ---
    integron = Integron(replicon, self.cfg)
    integron.integrase = integrase_df
    integron.add_promoter()
    pdt.assert_frame_equal(exp_promoters, integron.promoter)
# Test find_integron when the attC hits are given as the value returned by
# read_infernal instead of the path of the attc table file (presumably a
# DataFrame, as the test name says -- confirm against read_infernal).
# The configuration sets no_proteins=True. The test expects the logged summary
# to report 1 CALIN element with 3 attC sites, exactly one integron whose
# replicon name is 'ACBA.007.P01_13', the three expected attC rows, and empty
# integrase / promoter / attI / proteins frames.
# NOTE(review): the triple-quoted expected message (exp_msg) is compared to the
# captured log after .strip(); its internal line breaks and indentation are
# part of the compared value, so keep that literal exactly as it is in the file.
def test_find_integron_attC_is_df(self): replicon_name = 'acba.007.p01.13' replicon_id = 'ACBA.007.P01_13' replicon_path = self.find_data( os.path.join('Replicons', replicon_name + '.fst')) prot_file = self.find_data( os.path.join('Proteins', replicon_id + '.prt')) topologies = Topology('lin') with FastaIterator(replicon_path) as sequences_db: sequences_db.topologies = topologies replicon = next(sequences_db) attc_file = self.find_data( os.path.join('Results_Integron_Finder_{}'.format(replicon_name), 'tmp_{}'.format(replicon.id), '{}_attc_table.res'.format(replicon.id))) intI_file = self.find_data( os.path.join('Results_Integron_Finder_{}'.format(replicon_name), 'tmp_{}'.format(replicon.id), '{}_intI.res'.format(replicon.id))) phageI_file = self.find_data( os.path.join('Results_Integron_Finder_{}'.format(replicon_name), 'tmp_{}'.format(replicon.id), '{}_phage_int.res'.format(replicon.id))) args = argparse.Namespace() args.no_proteins = True args.keep_palindromes = True args.attc_model = 'attc_4.cm' args.evalue_attc = 1.0 args.max_attc_size = 200 args.min_attc_size = 40 args.distance_threshold = 4000 args.calin_threshold = 2 args.local_max = False cfg = Config(args) cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data') len_model_attc = 47 # length in 'CLEN' (value for model attc_4.cm) attc_file = read_infernal(attc_file, replicon_name, len_model_attc, evalue=cfg.evalue_attc, size_max_attc=cfg.max_attc_size, size_min_attc=cfg.min_attc_size) prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file) exp_msg = """In replicon {}, there are: - 0 complete integron(s) found with a total 0 attC site(s) - 1 CALIN element(s) found with a total of 3 attC site(s) - 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id) with self.catch_log() as log: integrons = find_integron(replicon, prot_db, attc_file, intI_file, phageI_file, cfg) catch_msg = log.get_value().strip() self.assertEqual(catch_msg, exp_msg) self.assertEqual(len(integrons), 1) integron 
= integrons[0] self.assertEqual(integron.replicon.name, replicon_id) exp = pd.DataFrame( { 'annotation': ['attC'] * 3, 'distance_2attC': [np.nan, 1196.0, 469.0], 'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07], 'model': ['attc_4'] * 3, 'pos_beg': [17825, 19080, 19618], 'pos_end': [17884, 19149, 19726], 'strand': [-1, -1, -1], 'type_elt': 'attC' }, columns=self.columns, index=['attc_001', 'attc_002', 'attc_003']) pdt.assert_frame_equal(integron.attC, exp) exp = pd.DataFrame(columns=self.columns) exp = exp.astype(dtype=self.dtype) pdt.assert_frame_equal(integron.integrase, exp) pdt.assert_frame_equal(integron.promoter, exp) pdt.assert_frame_equal(integron.attI, exp) pdt.assert_frame_equal(integron.proteins, exp)
def test_attI(self):
    """Check ``Integron.add_attI`` on saen.040.p01.10 in three set-ups.

    * attC sites and integrase: the ``attI1`` site is expected
    * attC sites only: empty attI frame expected
    * integrase only: the same ``attI1`` site is expected
    """
    replicon_name = 'saen.040.p01.10'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    attc_data = {
        'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
        'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
        'strand': [-1] * 6,
        'evalue': [3.400000e-06, 7.500000e-09, 6.800000e-06,
                   2.800000e-07, 6.600000e-06, 1.800000e-04],
        'type_elt': ['attC'] * 6,
        'annotation': ['attC'] * 6,
        'model': ['attc_4'] * 6,
        'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0],
    }
    attc_labels = ['attc_00{}'.format(i) for i in range(1, 7)]
    attC = pd.DataFrame(attc_data, index=attc_labels, columns=self.columns)
    attC = attC.astype(dtype=self.dtype)

    integrase_data = {
        'pos_beg': 109469,
        'pos_end': 110482,
        'strand': 1,
        'evalue': 1.600000e-24,
        'type_elt': 'protein',
        'annotation': 'intI',
        'model': 'intersection_tyr_intI',
        'distance_2attC': np.nan,
    }
    integrase_df = pd.DataFrame(integrase_data,
                                index=['SAEN.040.P01_10_135'],
                                columns=self.columns)
    integrase_df = integrase_df.astype(dtype=self.dtype)

    atti_data = {
        'pos_beg': [109330],
        'pos_end': [109388],
        'strand': [-1],
        'evalue': [np.nan],
        'type_elt': 'attI',
        'annotation': 'attI_1',
        'model': 'NA',
        'distance_2attC': [np.nan],
    }
    exp_attI = pd.DataFrame(atti_data, index=['attI1'], columns=self.columns)
    exp_attI = exp_attI.astype(dtype=self.dtype)

    # --- attI with attC and with integrase ---
    integron = Integron(replicon, self.cfg)
    integron.attC = attC
    integron.integrase = integrase_df
    integron.add_attI()
    pdt.assert_frame_equal(exp_attI, integron.attI)

    # --- attI with attC but without integrase ---
    integron = Integron(replicon, self.cfg)
    integron.attC = attC
    empty_attI = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)
    integron.add_attI()
    pdt.assert_frame_equal(empty_attI, integron.attI)

    # --- attI without attC but with integrase ---
    integron = Integron(replicon, self.cfg)
    integron.integrase = integrase_df
    integron.add_attI()
    pdt.assert_frame_equal(exp_attI, integron.attI)
def test_describe(self):
    """Check ``Integron.describe`` on an integron holding every kind of element.

    An integrase, an attC site, a promoter, an attI site and a protein are
    set on the integron. The expected description is the concatenation of
    these frames with the row labels moved into an ``element`` column, plus
    the ``type``, ``ID_replicon``, ``ID_integron``, ``default`` and
    ``considered_topology`` columns, with duplicates on ``element`` dropped.
    """
    replicon_name = "acba.007.p01.13"
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    args = argparse.Namespace()
    args.eagle_eyes = False
    args.local_max = False
    cfg = Config(args)
    integron = Integron(replicon, cfg)

    data_integrase = {"pos_beg": 55,
                      "pos_end": 1014,
                      "strand": 1,
                      "evalue": 1.900000e-25,
                      "type_elt": "protein",
                      "annotation": "intI",
                      "model": "intersection_tyr_intI",
                      "distance_2attC": np.nan}
    id_int = "ACBA.007.P01_13_1"
    integrase = pd.DataFrame(data_integrase, columns=self.columns, index=[id_int])
    integrase = integrase.astype(dtype=self.dtype)

    data_attc = {"pos_beg": 10,
                 "pos_end": 100,
                 "strand": -1,
                 "evalue": 1.1e-07,
                 "type_elt": "attC",
                 "annotation": "attC",
                 "model": "attc_4",
                 "distance_2attC": np.nan}
    attC = pd.DataFrame(data_attc, columns=self.columns, index=['attc_001'])
    attC = attC.astype(dtype=self.dtype)

    # the promoter, attI and protein frames reuse the attC data, only the label changes
    promoter = pd.DataFrame(data_attc, columns=self.columns, index=['prom_001'])
    promoter = promoter.astype(dtype=self.dtype)
    attI = pd.DataFrame(data_attc, columns=self.columns, index=['attI_001'])
    attI = attI.astype(dtype=self.dtype)
    proteins = pd.DataFrame(data_attc, columns=self.columns, index=['prot_001'])
    proteins = proteins.astype(dtype=self.dtype)

    expected_description = pd.concat([integrase, attC, promoter, attI, proteins],
                                     ignore_index=False)
    expected_description = expected_description.reset_index()
    # the former row labels become the "element" column
    expected_description.columns = ["element"] + list(expected_description.columns[1:])
    expected_description["type"] = "complete"
    expected_description["ID_replicon"] = replicon.id
    expected_description["ID_integron"] = id(integron)  # uniq identifier of a given Integron
    expected_description["default"] = "Yes"
    expected_description["considered_topology"] = replicon.topology
    expected_description.drop_duplicates(subset=["element"], inplace=True)

    # this assignment was written twice in a row; once is enough
    self.cfg._args.eagle_eyes = False

    integron.integrase = integrase
    integron.attC = attC
    integron.promoter = promoter
    integron.attI = attI
    integron.proteins = proteins

    received_description = integron.describe()
    pdt.assert_frame_equal(received_description, expected_description)
def test_integrons_report(self):
    """Compare ``results.integrons_report`` with the stored ``.integrons`` file.

    An integron is built on acba.007.p01.13 (circular) with one integrase,
    three attC sites, one promoter and four proteins. The report of this
    single integron must equal the tab-separated fixture, once the fixture
    columns listed in ``dtype`` have been cast.
    """
    replicon_name = "acba.007.p01.13"
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('circ')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    args = argparse.Namespace()
    cfg = Config(args)
    # eagle_eyes was assigned twice in a row; once is enough
    cfg._args.eagle_eyes = False
    cfg._args.local_max = False
    integron = Integron(replicon, cfg)

    columns = ['pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt',
               'model', 'distance_2attC', 'annotation']
    dtype = {"pos_beg": 'int',
             "pos_end": 'int',
             "strand": 'int',
             "evalue": 'float',
             "type_elt": 'str',
             "annotation": 'str',
             "model": 'str',
             "distance_2attC": 'float'}

    data_integrase = {"pos_beg": 55,
                      "pos_end": 1014,
                      "strand": 1,
                      "evalue": 1.900000e-25,
                      "type_elt": "protein",
                      "annotation": "intI",
                      "model": "intersection_tyr_intI",
                      "distance_2attC": np.nan}
    id_int = "ACBA.007.P01_13_1"
    integrase = pd.DataFrame(data_integrase, columns=columns, index=[id_int])
    integrase = integrase.astype(dtype=dtype)

    data_attc = {"pos_beg": [17825, 19080, 19618],
                 "pos_end": [17884, 19149, 19726],
                 "strand": [-1] * 3,
                 "evalue": [1.000000e-09, 1.000000e-04, 1.100000e-07],
                 "type_elt": ["attC"] * 3,
                 "annotation": ["attC"] * 3,
                 "model": ["attc_4"] * 3,
                 "distance_2attC": [np.nan, 1196.0, 469.0]}
    attC = pd.DataFrame(data_attc,
                        columns=columns,
                        index=['attc_00{}'.format(i) for i in range(1, 4)])
    attC = attC.astype(dtype=dtype)

    promoter = pd.DataFrame({'pos_beg': 25,
                             'pos_end': 51,
                             'strand': -1,
                             'evalue': np.nan,
                             'type_elt': 'Promoter',
                             'annotation': 'Pc_1',
                             'model': np.nan,
                             'distance_2attC': np.nan},
                            index=['Pc_int1'],
                            columns=columns)
    promoter = promoter.astype(dtype=dtype)

    proteins = pd.DataFrame({'pos_beg': [17375, 17886, 19090, 19721],
                             'pos_end': [17722, 18665, 19749, 20254],
                             'strand': [-1] * 4,
                             'evalue': [np.nan] * 4,
                             'type_elt': ['protein'] * 4,
                             'annotation': ['protein'] * 4,
                             'model': [np.nan] * 4,
                             'distance_2attC': [np.nan] * 4},
                            index=['ACBA.007.P01_13_2{}'.format(i) for i in range(0, 4)],
                            columns=columns)
    proteins = proteins.astype(dtype=dtype)

    integron.integrase = integrase
    integron.attC = attC
    integron.promoter = promoter
    integron.proteins = proteins

    report = results.integrons_report([integron])

    exp_report_path = self.find_data(
        os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                     '{}.integrons'.format(replicon_name)))
    exp_report = pd.read_csv(exp_report_path, sep="\t")
    exp_report = exp_report.astype(dtype=dtype)
    pdt.assert_frame_equal(exp_report, report)
def test_find_integron_proteins_circ_replicon(self):
    # Run find_integron() on pre-computed attC / intI / phage_int result
    # files for a replicon loaded with a 'circ' default topology, then check
    # the logged summary and the frames of the single integron returned.
    replicon_name = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))
    topologies = Topology('circ')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.circular'
    attc_file = self.find_data(
        os.path.join(exp_result_dir,
                     'tmp_{}'.format(replicon.id),
                     '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(
        os.path.join(exp_result_dir,
                     'tmp_{}'.format(replicon.id),
                     '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(
        os.path.join(exp_result_dir,
                     'tmp_{}'.format(replicon.id),
                     '{}_phage_int.res'.format(replicon.id)))

    # A first Namespace used to be filled here and immediately thrown away
    # by a second `args = argparse.Namespace()`; only the effective one is
    # kept (it sets every attribute the discarded one did).
    args = argparse.Namespace()
    args.evalue_attc = 1.
    args.max_attc_size = 200
    args.min_attc_size = 40
    args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
    args.attc_model = 'attc_4.cm'
    args.no_proteins = False
    args.gembase = False  # needed by read_hmm which is called when no_proteins == False
    args.union_integrases = False
    args.keep_palindromes = True
    args.calin_threshold = 2
    args.local_max = False
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): this text is compared verbatim with the captured log;
    # confirm the whitespace between the items matches the logger output.
    exp_msg = ("In replicon {}, there are: "
               "- 1 complete integron(s) found with a total 3 attC site(s) "
               "- 0 CALIN element(s) found with a total of 0 attC site(s) "
               "- 0 In0 element(s) found with a total of 0 attC site"
               ).format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.replicon.name, replicon_id)

    # expected integrase
    exp = pd.DataFrame(
        {
            'annotation': 'intI',
            'distance_2attC': np.nan,
            'evalue': 1.900000e-25,
            'model': 'intersection_tyr_intI',
            'pos_beg': 55,
            'pos_end': 1014,
            'strand': 1,
            'type_elt': 'protein',
        },
        columns=self.columns,
        index=['ACBA.007.P01_13_1'])
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.integrase, exp)

    # expected attC sites
    exp = pd.DataFrame(
        {
            'annotation': ['attC'] * 3,
            'distance_2attC': [np.nan, 1196.0, 469.0],
            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
            'model': ['attc_4'] * 3,
            'pos_beg': [17825, 19080, 19618],
            'pos_end': [17884, 19149, 19726],
            'strand': [-1, -1, -1],
            'type_elt': 'attC',
        },
        columns=self.columns,
        index=['attc_001', 'attc_002', 'attc_003'])
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.attC, exp)

    # promoter, attI and proteins are all expected to be empty frames
    exp = pd.DataFrame(columns=self.columns)
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.promoter, exp)
    pdt.assert_frame_equal(integron.attI, exp)
    pdt.assert_frame_equal(integron.proteins, exp)
def test_find_integron_proteins_n_union_integrase(self):
    # Run find_integron() with union_integrases switched on, using
    # pre-computed result files, and compare the integrase / attC frames of
    # each of the five returned integrons with hand-written expectations.
    replicon_name = 'OBAL001.B.00005.C001'
    replicon_id = 'OBAL001.B.00005.C001'
    fasta_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_name + '.prt'))
    topologies = Topology('lin')
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # the three input files live in the same per-replicon tmp directory
    result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
    tmp_dir = os.path.join(result_dir, 'tmp_{}'.format(replicon.id))
    attc_file = self.find_data(
        os.path.join(tmp_dir, '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(
        os.path.join(tmp_dir, '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(
        os.path.join(tmp_dir, '{}_phage_int.res'.format(replicon.id)))

    args = argparse.Namespace(
        evalue_attc=1.,
        max_attc_size=200,
        min_attc_size=40,
        distance_threshold=4000,  # (4kb at least between 2 different arrays)
        calin_threshold=2,
        attc_model='attc_4.cm',
        no_proteins=False,
        keep_palindromes=True,
        union_integrases=True,
        gembase=False,  # needed by read_hmm which is called when no_proteins == False
        local_max=False,
    )
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    exp_msg = ("In replicon {}, there are: "
               "- 3 complete integron(s) found with a total 4 attC site(s) "
               "- 0 CALIN element(s) found with a total of 0 attC site(s) "
               "- 2 In0 element(s) found with a total of 0 attC site"
               ).format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)
    self.assertEqual(len(integrons), 5)
    self.assertEqual(integrons[0].replicon.name, replicon_id)

    def as_frame(rows, index):
        # rows follow the order of self.columns; cast to the shared dtypes
        frame = pd.DataFrame(rows, columns=self.columns, index=index)
        return frame.astype(dtype=self.dtype)

    empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

    # one single-row integrase frame per expected integron, in order
    exp_int = [
        as_frame([[418072, 419283, 1, 5.400000e-25,
                   'protein', 'Phage_integrase', np.nan, 'intI']],
                 ['OBAL001.B.00005.C001_388']),
        as_frame([[434671, 440118, -1, 0.085,
                   'protein', 'Phage_integrase', np.nan, 'intI']],
                 ['OBAL001.B.00005.C001_399']),
        as_frame([[516941, 517834, -1, 1.200000e-54,
                   'protein', 'Phage_integrase', np.nan, 'intI']],
                 ['OBAL001.B.00005.C001_472']),
        as_frame([[1940269, 1941171, 1, 4.200000e-43,
                   'protein', 'Phage_integrase', np.nan, 'intI']],
                 ['OBAL001.B.00005.C001_1793']),
        as_frame([[1545830, 1546807, -1, 1.100000e-21,
                   'protein', 'intersection_tyr_intI', np.nan, 'intI']],
                 ['OBAL001.B.00005.C001_1416']),
    ]

    # matching attC frames; the 3rd and 4th integrons carry no attC site
    exp_attC = [
        as_frame([[421689, 421764, 1, 0.13,
                   'attC', 'attc_4', np.nan, 'attC']],
                 ['attc_001']),
        as_frame([[442458, 442514, -1, 7.000000e-07,
                   'attC', 'attc_4', np.nan, 'attC']],
                 ['attc_001']),
        empty,
        empty,
        as_frame([[1547800, 1547859, 1, 0.00049,
                   'attC', 'attc_4', np.nan, 'attC'],
                  [1548775, 1548834, 1, 0.00009,
                   'attC', 'attc_4', 916.0, 'attC']],
                 ['attc_001', 'attc_002']),
    ]

    for found, wanted_int, wanted_attc in zip(integrons, exp_int, exp_attC):
        self.assertEqual(found.replicon.name, replicon_id)
        pdt.assert_frame_equal(found.integrase, wanted_int)
        pdt.assert_frame_equal(found.attC, wanted_attc)
        pdt.assert_frame_equal(found.promoter, empty)
        pdt.assert_frame_equal(found.attI, empty)
        pdt.assert_frame_equal(found.proteins, empty)
def test_FastaIterator(self):
    # Exercise utils.FastaIterator on several fixture files: sequence ids
    # and names, a forced replicon name, the topology attached to each
    # sequence, and the warnings logged when a sequence is skipped.

    # --- ids, length and names on a multi-fasta file ---
    file_name = 'multi_fasta'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    topologies = Topology('lin')
    with utils.FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        received_seq_id = sorted(seq.id for seq in seq_db)
        expected_seq_id = sorted(
            ['ACBA.007.P01_13', 'LIAN.001.C02_10', 'PSSU.001.C01_13'])
        self.assertListEqual(expected_seq_id, received_seq_id)
        self.assertEqual(len(seq_db), 3)

    expected_seq_name = expected_seq_id
    with utils.FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        received_seq_name = sorted(seq.name for seq in seq_db)
    self.assertListEqual(expected_seq_name, received_seq_name)

    # --- replicon_name overrides the name of every sequence ---
    replicon_name = 'foo'
    with utils.FastaIterator(replicon_path,
                             replicon_name=replicon_name) as seq_db:
        seq_db.topologies = topologies
        received_seq_id = {seq.name for seq in seq_db}
    expected_seq_name = {replicon_name}
    self.assertSetEqual(expected_seq_name, received_seq_id)

    # --- topology when no Topology object is injected ---
    with utils.FastaIterator(replicon_path) as seq_db:
        received_seq_top = [seq.topology for seq in seq_db]
    expected_seq_top = ['lin', 'lin', 'lin']
    self.assertListEqual(expected_seq_top, received_seq_top)

    # --- per-sequence topology read from a topology file ---
    topologies_data = {
        'ACBA.007.P01_13': 'lin',
        'LIAN.001.C02_10': 'circ',
        'PSSU.001.C01_13': 'lin',
    }
    with tempfile.NamedTemporaryFile(mode='w') as topology_file:
        for rep, topo in topologies_data.items():
            topology_file.write("{} {}\n".format(rep, topo))
        # make sure the content is on disk before Topology reads the file
        topology_file.flush()
        topologies = Topology('lin', topology_file=topology_file.name)
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_top = {seq.id: seq.topology for seq in seq_db}
        self.assertDictEqual(topologies_data, received_seq_top)

    # --- 'acba_short': 'lin' is expected although the default is 'circ' ---
    file_name = 'acba_short'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    topologies = Topology('circ')
    with utils.FastaIterator(replicon_path) as seq_db:
        seq_db.topologies = topologies
        received_seq_top = [seq.topology for seq in seq_db]
    expected_seq_top = ['lin']
    self.assertListEqual(expected_seq_top, received_seq_top)

    # --- ambiguous characters: all four sequences are kept ---
    file_name = 'replicon_ambiguous_char'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    with utils.FastaIterator(replicon_path) as seq_db:
        received_seq_id = sorted(seq.id for seq in seq_db if seq)
    expected_seq_id = sorted(['seq_1', 'seq_2', 'seq_3', 'seq_4'])
    self.assertListEqual(expected_seq_id, received_seq_id)

    # --- invalid characters: seq_3 and seq_4 are skipped ---
    file_name = 'replicon_bad_char'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    expected_warning = (
        r"sequence seq_(3|4) contains invalid characters, the sequence is skipped. "
        r"sequence seq_(3|4) contains invalid characters, the sequence is skipped."
    )
    with utils.FastaIterator(replicon_path) as seq_db:
        # 2 sequences are rejected so 2 messages are produced (for seq 3 and seq 4)
        with self.catch_log() as log:
            received_seq_id = sorted(seq.id for seq in seq_db if seq)
            got_warning = log.get_value().strip()
    self.assertRegex(got_warning, expected_warning)
    expected_seq_id = sorted(['seq_1', 'seq_2'])
    self.assertListEqual(expected_seq_id, received_seq_id)

    # --- too short sequences: seq_2 and seq_4 are skipped ---
    file_name = 'replicon_too_short'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    # raw string: '\(' and '\)' are regex escapes, not Python string escapes
    expected_warning = (
        r"sequence seq_(4|2) is too short \(32 bp\), the sequence is skipped \(must be > 50bp\). "
        r"sequence seq_(4|2) is too short \(32 bp\), the sequence is skipped \(must be > 50bp\)."
    )
    with utils.FastaIterator(replicon_path) as seq_db:
        # 2 sequences are rejected so 2 messages are produced (for seq 2 & 4)
        with self.catch_log() as log:
            received_seq_id = sorted(seq.id for seq in seq_db if seq)
            got_warning = log.get_value().strip()
    self.assertRegex(got_warning, expected_warning)
    expected_seq_id = sorted(['seq_1', 'seq_3'])
    self.assertListEqual(expected_seq_id, received_seq_id)
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    Parse the command line, create the output directories, initialise the
    loggers, check that the external binaries are configured, search the
    integrons in every replicon of the input file and, unless
    ``config.split_results`` is set, merge the per-replicon result files.

    :param args: the arguments passed on the command line
                 (``sys.argv[1:]`` is used when it is None)
    :type args: list of str
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raises IsADirectoryError: if outdir or result dir exists and is not a directory
    :raises PermissionError: if result dir already exists and is not writable
    :raises RuntimeError: if cmsearch, hmmsearch or prodigal is not configured
    """
    global _log

    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################

    # need to create directory before to init logger
    # as we write log in integron_finder.out in this dir
    # (for the same reason the errors below cannot be logged:
    # the loggers are not initialized yet)

    # NOTE(review): NotADirectoryError would describe the "exists but is not
    # a directory" cases more accurately; IsADirectoryError is kept because
    # callers may already catch it.
    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    elif not os.path.isdir(config.outdir):
        msg = "outdir '{}' already exists and is not a directory".format(
            config.outdir)
        raise IsADirectoryError(msg)

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    elif not os.path.isdir(config.result_dir):
        # report the path that was actually tested (result_dir, not outdir)
        msg = "result dir '{}' already exists and is not a directory".format(
            config.result_dir)
        raise IsADirectoryError(msg)
    elif not os.access(config.result_dir, os.W_OK):
        msg = "result dir '{}' already exists and is not writable".format(
            config.result_dir)
        raise PermissionError(msg)

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file, out=not config.mute)

    _log = colorlog.getLogger('integron_finder')

    if not loglevel:
        # logs are specify from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    # (binary / option name, package providing it)
    required_binaries = (
        ('cmsearch', 'infernal'),
        ('hmmsearch', 'hmmer'),
        ('prodigal', 'prodigal'),
    )
    for binary, package in required_binaries:
        if getattr(config, binary) is None:
            msg = ("cannot find '{0}' in PATH. "
                   "Please install {1} package or setup '{0}' binary path "
                   "with --{0} option").format(binary, package)
            _log.critical(msg)
            raise RuntimeError(msg)

    ################
    # print Header #
    ################
    # the header goes through its own logger with a bare "%(message)s"
    # format and does not propagate to the parent loggers
    log_header = colorlog.getLogger('integron_finder.header')
    std_logging = colorlog.logging.logging
    handlers = [std_logging.FileHandler(log_file)]
    if not config.mute:
        handlers.append(colorlog.StreamHandler(sys.stdout))
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(std_logging.INFO)
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(
            config.input_seq_path,
            dist_threshold=config.distance_threshold) as sequences_db:
        sequences_db_len = len(sequences_db)

        ################
        # set topology #
        ################
        default_topology = 'circ' if sequences_db_len == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology,
                              topology_file=config.topology_file)

        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is None:
                _log.warning(
                    "############ Skipping replicon {}/{} ############".format(
                        rep_no, sequences_db_len))
                continue

            _log.info(
                "############ Processing replicon {} ({}/{}) ############\n"
                .format(replicon.id, rep_no, sequences_db_len))
            integron_res, summary = find_integron_in_one_replicon(
                replicon, config)
            if integron_res:
                all_integrons.append(integron_res)
            if summary:
                all_summaries.append(summary)

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(
            config.result_dir,
            utils.get_name_from_path(config.input_seq_path))

        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file,
                                 sep="\t",
                                 index=False,
                                 na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")

        merged_summary_file = outfile_base_name + ".summary"
        # NOTE(review): the summary is gated on agg_integrons (not on
        # agg_summary) being non-empty -- confirm this is intended.
        if not agg_integrons.empty:
            agg_summary.to_csv(merged_summary_file,
                               sep="\t",
                               index=False,
                               na_rep="NA",
                               columns=[
                                   'ID_replicon', 'ID_integron', 'complete',
                                   'In0', 'CALIN'
                               ])

        # remove the per-replicon files now that they are merged
        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name that a replicon result file
                os.unlink(_file)