def test_find_integron_calin_threshold(self):
    # Calls find_integron twice on the same replicon and precomputed result
    # files, with calin_threshold set to 2 and then to 3, and checks how many
    # integrons come back each time (2, then 1).
    # NOTE(review): a method with this exact name is defined again further
    # down in this view; if both live in the same class the later one wins.
    replicon_name = 'ESCO001.B.00018.P002'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_name + '.prt'))

    circular = Topology('circ')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = circular
        replicon = next(sequences_db)

    results_dir = self.find_data(
        os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                     'tmp_{}'.format(replicon.id)))
    attc_file, intI_file, phageI_file = (
        os.path.join(results_dir, pattern.format(replicon.id))
        for pattern in ('{}_attc_table.res', '{}_intI.res', '{}_phage_int.res')
    )

    args = argparse.Namespace()
    for option, value in (
            ('no_proteins', False),
            ('keep_palindromes', True),
            ('distance_threshold', 4000),
            ('attc_model', 'attc_4.cm'),
            ('evalue_attc', 1.0),
            ('max_attc_size', 200),
            ('min_attc_size', 40),
            ('local_max', False),
            ('gembase', False),
            ('union_integrases', False)):
        setattr(args, option, value)

    def make_cfg(threshold):
        # Build a fresh Config on the shared namespace for this threshold.
        args.calin_threshold = threshold
        new_cfg = Config(args)
        new_cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
        return new_cfg

    cfg = make_cfg(2)
    # The protein db is built once, with the first config, and reused below.
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
    with self.catch_log():
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
    self.assertEqual(len(integrons), 2)

    cfg = make_cfg(3)
    with self.catch_log():
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
    self.assertEqual(len(integrons), 1)
def test_find_integrase_gembase(self):
    # Runs integrase.find_integrase with gembase switched on, using a copy of
    # the reference protein file, then checks that the four result files
    # (_intI.res, _intI_table.res, _phage_int.res, _phage_int_table.res)
    # exist in the temporary directory.
    cfg = Config(self.args)
    self.args.gembase = True
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    fasta_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    linear = Topology('lin')
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    prot_file = os.path.join(self.tmp_dir, replicon_name + ".prt")
    reference_prt = self.find_data(
        os.path.join('Proteins', replicon.id + ".prt"))
    shutil.copyfile(reference_prt, prot_file)

    integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

    expected_outputs = [
        os.path.join(self.tmp_dir, replicon.id + suffix)
        for suffix in ('_intI.res', '_intI_table.res',
                       '_phage_int.res', '_phage_int_table.res')
    ]
    for res in expected_outputs:
        self.assertTrue(os.path.exists(res))
def test_find_integrase_no_gembase_no_protfile(self):
    # With gembase off and an empty protein file ("foo.prt"), find_integrase
    # is expected to raise EmptyFileError. The replicon class's __len__ is
    # temporarily replaced so every replicon reports a length of 500000.
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Patch only once the replicon exists, and open the try block right
    # after: the finally clause can then never hit an unbound name and hide
    # the real failure behind a NameError.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 500000
    try:
        prot_file = os.path.join(self.tmp_dir, "foo.prt")
        # Create (or truncate) an empty protein file.
        open(prot_file, 'w').close()
        with self.catch_log():
            with self.assertRaises(EmptyFileError):
                integrase.find_integrase(replicon.id, prot_file,
                                         self.tmp_dir, cfg)
    finally:
        # Always restore the class-level __len__ for the other tests.
        replicon_cls.__len__ = len_ori
def test_find_integrase_no_gembase_no_protfile_no_prodigal(self):
    # Points the hmmsearch option at a non-existent executable ('foo') and
    # checks that find_integrase raises a RuntimeError whose message reports
    # the missing file. The replicon class's __len__ is temporarily replaced
    # so every replicon reports a length of 500000.
    self.args.hmmsearch = 'foo'
    self.args.gembase = False
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Patch only once the replicon exists, and open the try block right
    # after: the finally clause can then never hit an unbound name and hide
    # the real failure behind a NameError.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 500000
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(
            self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
            prot_file)
        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file,
                                     self.tmp_dir, cfg)
        # Raw string: same pattern as before, but without relying on
        # invalid "\[" escapes inside a normal string literal.
        self.assertTrue(
            re.search(
                r"failed : \[Errno 2\] No such file or directory: 'foo'",
                str(ctx.exception)))
    finally:
        # Always restore the class-level __len__ for the other tests.
        replicon_cls.__len__ = len_ori
def test_find_integrase_no_gembase_no_protfile_short_seq(self):
    # Same scenario as the other no-gembase tests but with the replicon
    # reporting a length of 200 (class-level __len__ temporarily replaced).
    # find_integrase must still produce the four result files.
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_name = 'ACBA.007.P01_13'
    prot_path = self.find_data(
        os.path.join('Proteins', prot_name + '.prt'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Patch only once the replicon exists, and open the try block right
    # after: the finally clause can then never hit an unbound name and hide
    # the real failure behind a NameError.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 200
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        shutil.copyfile(prot_path, prot_file)

        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        for suffix in ('_intI.res', '_intI_table.res',
                       '_phage_int.res', '_phage_int_table.res'):
            res = os.path.join(self.tmp_dir, replicon.id + suffix)
            self.assertTrue(os.path.exists(res))
    finally:
        # Always restore the class-level __len__ for the other tests.
        replicon_cls.__len__ = len_ori
def test_find_integrase_no_gembase_with_protfile_empty(self):
    # Feeds find_integrase an empty protein file named after the replicon and
    # checks that it raises EmptyFileError with the expected message. The
    # replicon class's __len__ is temporarily replaced to report 200.
    cfg = Config(self.args)
    self.args.gembase = False
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    replicon_name = 'acba.007.p01.13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # Patch only once the replicon exists, and open the try block right
    # after: the finally clause can then never hit an unbound name and hide
    # the real failure behind a NameError.
    replicon_cls = replicon.__class__
    len_ori = replicon_cls.__len__
    replicon_cls.__len__ = lambda x: 200
    try:
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        # Create (or truncate) an empty protein file.
        open(prot_file, 'w').close()
        with self.assertRaises(EmptyFileError) as ctx:
            with self.catch_log():
                integrase.find_integrase(replicon.id, prot_file,
                                         self.tmp_dir, cfg)
        self.assertTrue(
            re.match(
                "^The protein file: '.*' is empty cannot perform hmmsearch on it.$",
                str(ctx.exception)))
    finally:
        # Always restore the class-level __len__ for the other tests.
        replicon_cls.__len__ = len_ori
def test_find_integron_calin_threshold(self):
    # find_integron is run with calin_threshold=2 (2 integrons expected) and
    # again with calin_threshold=3 (1 integron expected) on the same inputs.
    # NOTE(review): an earlier method in this view carries the same name; if
    # both belong to one class, this later definition replaces it.
    replicon_name = 'ESCO001.B.00018.P002'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
    topology = Topology('circ')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = topology
        replicon = next(sequences_db)

    res_dir = self.find_data(os.path.join(
        'Results_Integron_Finder_{}'.format(replicon_name),
        'tmp_{}'.format(replicon.id)))
    attc_file = os.path.join(res_dir, '{}_attc_table.res'.format(replicon.id))
    intI_file = os.path.join(res_dir, '{}_intI.res'.format(replicon.id))
    phageI_file = os.path.join(res_dir, '{}_phage_int.res'.format(replicon.id))

    data_prefix_parts = (os.path.dirname(__file__), 'data')

    args = argparse.Namespace(
        no_proteins=False,
        keep_palindromes=True,
        distance_threshold=4000,
        attc_model='attc_4.cm',
        evalue_attc=1.0,
        max_attc_size=200,
        min_attc_size=40,
        local_max=False,
        gembase=False,
        union_integrases=False,
        calin_threshold=2,
    )
    cfg = Config(args)
    cfg._prefix_data = os.path.join(*data_prefix_parts)
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    with self.catch_log():
        found = find_integron(replicon, prot_db, attc_file, intI_file,
                              phageI_file, cfg)
    self.assertEqual(len(found), 2)

    # Second pass: stricter threshold, new Config, same protein db.
    args.calin_threshold = 3
    cfg = Config(args)
    cfg._prefix_data = os.path.join(*data_prefix_parts)
    with self.catch_log():
        found = find_integron(replicon, prot_db, attc_file, intI_file,
                              phageI_file, cfg)
    self.assertEqual(len(found), 1)
def test_find_integrase_gembase_hmmer_error(self):
    # Sets the cpu option to the invalid value 'foo' and checks that
    # find_integrase raises a RuntimeError whose message ends with
    # 'failed return code = 1'.
    self.args.gembase = True
    self.args.cpu = 'foo'
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    fasta_path = os.path.join(self._data_dir, 'Replicons',
                              replicon_name + '.fst')
    linear = Topology('lin')
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
    reference_prt = os.path.join(self._data_dir, 'Proteins',
                                 replicon.id + ".prt")
    shutil.copyfile(reference_prt, prot_file)

    with self.assertRaises(RuntimeError) as ctx:
        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
    error_text = str(ctx.exception)
    self.assertTrue(error_text.endswith('failed return code = 1'))
def test_find_integrase_gembase_no_hmmer_no_replicon(self):
    # The protein file path is built but the file is never created here, so
    # find_integrase is expected to raise a RuntimeError carrying the
    # "does not exists" message for that path.
    self.args.gembase = True
    self.args.hmmsearch = 'foo'
    cfg = Config(self.args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    replicon_name = 'acba.007.p01.13'
    fasta_path = os.path.join(self._data_dir, 'Replicons',
                              replicon_name + '.fst')
    linear = Topology('lin')
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")

    with self.catch_log():
        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file,
                                     self.tmp_dir, cfg)
    expected = "The protein file: '{}' does not exists cannot perform hmmsearch on it.".format(prot_file)
    self.assertEqual(expected, str(ctx.exception))
def test_find_integron_proteins_n_union_integrase(self):
    # Runs find_integron with union_integrases enabled on precomputed result
    # files and compares the log summary, the number of integrons (5) and the
    # integrase / attC / promoter / attI / proteins frames of each one.
    # NOTE(review): a method with this exact name is defined again further
    # down in this view; if both live in the same class the later one wins.
    replicon_name = 'OBAL001.B.00005.C001'
    replicon_id = 'OBAL001.B.00005.C001'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_name + '.prt'))
    linear = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
    attc_file, intI_file, phageI_file = (
        self.find_data(os.path.join(result_dir,
                                    'tmp_{}'.format(replicon.id),
                                    pattern.format(replicon.id)))
        for pattern in ('{}_attc_table.res', '{}_intI.res', '{}_phage_int.res')
    )

    args = argparse.Namespace(
        evalue_attc=1.,
        max_attc_size=200,
        min_attc_size=40,
        distance_threshold=4000,  # (4kb at least between 2 different arrays)
        calin_threshold=2,
        attc_model='attc_4.cm',
        no_proteins=False,
        keep_palindromes=True,
        union_integrases=True,
        gembase=False,  # needed by read_hmm which is called when no_proteins == False
        local_max=False,
    )
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): literal reproduced exactly as it appears in this view;
    # confirm its whitespace against what the logger actually emits.
    exp_msg = """In replicon {}, there are: - 3 complete integron(s) found with a total 4 attC site(s) - 0 CALIN element(s) found with a total of 0 attC site(s) - 2 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)
    self.assertEqual(len(integrons), 5)
    self.assertEqual(integrons[0].replicon.name, replicon_id)

    def frame(rows, index):
        # Expected frame with the fixture's column layout and dtypes.
        return pd.DataFrame(rows, columns=self.columns,
                            index=index).astype(dtype=self.dtype)

    empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

    integrase_rows = (
        ('OBAL001.B.00005.C001_388',
         [418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_399',
         [434671, 440118, -1, 0.085, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_472',
         [516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_1793',
         [1940269, 1941171, 1, 4.200000e-43, 'protein', 'Phage_integrase', np.nan, 'intI']),
        ('OBAL001.B.00005.C001_1416',
         [1545830, 1546807, -1, 1.100000e-21, 'protein', 'intersection_tyr_intI', np.nan, 'intI']),
    )
    exp_int = [frame([row], [prot_id]) for prot_id, row in integrase_rows]

    exp_attC = [
        frame([[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
              ['attc_001']),
        frame([[442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan, 'attC']],
              ['attc_001']),
        empty,
        empty,
        frame([[1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'],
               [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']],
              ['attc_001', 'attc_002']),
    ]

    for i, integron in enumerate(integrons):
        self.assertEqual(integron.replicon.name, replicon_id)
        pdt.assert_frame_equal(integron.integrase, exp_int[i])
        pdt.assert_frame_equal(integron.attC, exp_attC[i])
        pdt.assert_frame_equal(integron.promoter, empty)
        pdt.assert_frame_equal(integron.attI, empty)
        pdt.assert_frame_equal(integron.proteins, empty)
def test_find_integron_proteins_circ_replicon(self):
    # Runs find_integron on a replicon loaded with circular topology and
    # checks the log summary, the single integron returned, and its
    # integrase / attC / promoter / attI / proteins frames.
    replicon_name = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))
    topologies = Topology('circ')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.circular'
    attc_file = self.find_data(
        os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                     '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(
        os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                     '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(
        os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                     '{}_phage_int.res'.format(replicon.id)))

    # A single namespace is enough: the previous version filled a first
    # Namespace and then discarded it by rebinding `args` to a new one.
    args = argparse.Namespace()
    args.evalue_attc = 1.
    args.max_attc_size = 200
    args.min_attc_size = 40
    args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
    args.attc_model = 'attc_4.cm'
    args.no_proteins = False
    args.gembase = False  # needed by read_hmm which is called when no_proteins == False
    args.union_integrases = False
    args.keep_palindromes = True
    args.calin_threshold = 2
    args.local_max = False
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): literal reproduced exactly as it appears in this view;
    # confirm its whitespace against what the logger actually emits.
    exp_msg = """In replicon {}, there are: - 1 complete integron(s) found with a total 3 attC site(s) - 0 CALIN element(s) found with a total of 0 attC site(s) - 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.replicon.name, replicon_id)

    exp = pd.DataFrame(
        {
            'annotation': 'intI',
            'distance_2attC': np.nan,
            'evalue': 1.900000e-25,
            'model': 'intersection_tyr_intI',
            'pos_beg': 55,
            'pos_end': 1014,
            'strand': 1,
            'type_elt': 'protein'
        },
        columns=self.columns,
        index=['ACBA.007.P01_13_1'])
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.integrase, exp)

    exp = pd.DataFrame(
        {
            'annotation': ['attC'] * 3,
            'distance_2attC': [np.nan, 1196.0, 469.0],
            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
            'model': ['attc_4'] * 3,
            'pos_beg': [17825, 19080, 19618],
            'pos_end': [17884, 19149, 19726],
            'strand': [-1, -1, -1],
            'type_elt': 'attC'
        },
        columns=self.columns,
        index=['attc_001', 'attc_002', 'attc_003'])
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.attC, exp)

    # The remaining frames are expected to be empty.
    exp = pd.DataFrame(columns=self.columns)
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.promoter, exp)
    pdt.assert_frame_equal(integron.attI, exp)
    pdt.assert_frame_equal(integron.proteins, exp)
def test_find_integron_attC_is_df(self):
    # Same scenario as the file-based find_integron test, except that the
    # attC input is first parsed with read_infernal and the parsed result,
    # not the file path, is handed to find_integron.
    replicon_name = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))
    linear = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    results_root = 'Results_Integron_Finder_{}'.format(replicon_name)
    attc_path, intI_file, phageI_file = (
        self.find_data(os.path.join(results_root,
                                    'tmp_{}'.format(replicon.id),
                                    pattern.format(replicon.id)))
        for pattern in ('{}_attc_table.res', '{}_intI.res', '{}_phage_int.res')
    )

    args = argparse.Namespace(
        no_proteins=True,
        keep_palindromes=True,
        attc_model='attc_4.cm',
        evalue_attc=1.0,
        max_attc_size=200,
        min_attc_size=40,
        distance_threshold=4000,
        calin_threshold=2,
        local_max=False,
    )
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

    len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)
    attc_parsed = read_infernal(attc_path, replicon_name, len_model_attc,
                                evalue=cfg.evalue_attc,
                                size_max_attc=cfg.max_attc_size,
                                size_min_attc=cfg.min_attc_size)
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): literal reproduced exactly as it appears in this view;
    # confirm its whitespace against what the logger actually emits.
    exp_msg = """In replicon {}, there are: - 0 complete integron(s) found with a total 0 attC site(s) - 1 CALIN element(s) found with a total of 3 attC site(s) - 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_parsed, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.replicon.name, replicon_id)

    expected_attc = pd.DataFrame(
        {
            'annotation': ['attC'] * 3,
            'distance_2attC': [np.nan, 1196.0, 469.0],
            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
            'model': ['attc_4'] * 3,
            'pos_beg': [17825, 19080, 19618],
            'pos_end': [17884, 19149, 19726],
            'strand': [-1, -1, -1],
            'type_elt': 'attC'
        },
        columns=self.columns,
        index=['attc_001', 'attc_002', 'attc_003'])
    pdt.assert_frame_equal(integron.attC, expected_attc)

    # Every other frame of the integron is expected to be empty.
    expected_empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)
    for actual in (integron.integrase, integron.promoter,
                   integron.attI, integron.proteins):
        pdt.assert_frame_equal(actual, expected_empty)
def test_find_integron(self):
    # Runs find_integron with no_proteins=True on precomputed result files
    # and checks the log summary, the single integron returned and its
    # attC / integrase / promoter / attI / proteins frames.
    replicon_name = 'acba.007.p01.13'
    prot_name = 'ACBA.007.P01_13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', prot_name + '.prt'))
    linear = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = linear
        replicon = next(sequences_db)

    results_dir = self.find_data(
        os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                     'tmp_{}'.format(replicon.id)))
    attc_file, intI_file, phageI_file = (
        os.path.join(results_dir, pattern.format(replicon.id))
        for pattern in ('{}_attc_table.res', '{}_intI.res', '{}_phage_int.res')
    )

    args = argparse.Namespace(
        no_proteins=True,
        keep_palindromes=True,
        distance_threshold=4000,
        attc_model='attc_4.cm',
        evalue_attc=1.0,
        max_attc_size=200,
        min_attc_size=40,
        calin_threshold=2,
        local_max=False,
    )
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): literal reproduced exactly as it appears in this view;
    # confirm its whitespace against what the logger actually emits.
    exp_msg = """In replicon {}, there are: - 0 complete integron(s) found with a total 0 attC site(s) - 1 CALIN element(s) found with a total of 3 attC site(s) - 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.replicon.id, replicon.id)

    attc_columns = {
        'annotation': ['attC'] * 3,
        'distance_2attC': [np.nan, 1196.0, 469.0],
        'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
        'model': ['attc_4'] * 3,
        'pos_beg': [17825, 19080, 19618],
        'pos_end': [17884, 19149, 19726],
        'strand': [-1, -1, -1],
        'type_elt': 'attC',
    }
    expected_attc = pd.DataFrame(attc_columns, columns=self.columns,
                                 index=['attc_001', 'attc_002', 'attc_003'])
    pdt.assert_frame_equal(integron.attC, expected_attc)

    # Every other frame of the integron is expected to be empty.
    expected_empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)
    for actual in (integron.integrase, integron.promoter,
                   integron.attI, integron.proteins):
        pdt.assert_frame_equal(actual, expected_empty)
def test_find_integron_proteins_n_union_integrase(self):
    # With union_integrases=True, find_integron is expected to report five
    # integrons; the log summary and every frame of each integron are
    # compared against hard-coded expectations.
    # NOTE(review): an earlier method in this view carries the same name; if
    # both belong to one class, this later definition replaces it.
    replicon_name = 'OBAL001.B.00005.C001'
    replicon_id = 'OBAL001.B.00005.C001'
    fasta = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(os.path.join('Proteins', replicon_name + '.prt'))
    topology = Topology('lin')
    with FastaIterator(fasta) as sequences_db:
        sequences_db.topologies = topology
        replicon = next(sequences_db)

    result_dir = 'Results_Integron_Finder_{}.union'.format(replicon_name)
    tmp_dir_name = 'tmp_{}'.format(replicon.id)
    attc_file = self.find_data(os.path.join(
        result_dir, tmp_dir_name, '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(os.path.join(
        result_dir, tmp_dir_name, '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(os.path.join(
        result_dir, tmp_dir_name, '{}_phage_int.res'.format(replicon.id)))

    args = argparse.Namespace()
    for option, value in (
            ('evalue_attc', 1.),
            ('max_attc_size', 200),
            ('min_attc_size', 40),
            ('distance_threshold', 4000),  # (4kb at least between 2 different arrays)
            ('calin_threshold', 2),
            ('attc_model', 'attc_4.cm'),
            ('no_proteins', False),
            ('keep_palindromes', True),
            ('union_integrases', True),
            ('gembase', False),  # needed by read_hmm which is called when no_proteins == False
            ('local_max', False)):
        setattr(args, option, value)
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): literal reproduced exactly as it appears in this view;
    # confirm its whitespace against what the logger actually emits.
    exp_msg = """In replicon {}, there are: - 3 complete integron(s) found with a total 4 attC site(s) - 0 CALIN element(s) found with a total of 0 attC site(s) - 2 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)
    self.assertEqual(len(integrons), 5)

    first = integrons[0]
    self.assertEqual(first.replicon.name, replicon_id)

    empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

    exp_int = [
        pd.DataFrame(
            [[418072, 419283, 1, 5.400000e-25, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_388']).astype(dtype=self.dtype),
        pd.DataFrame(
            [[434671, 440118, -1, 0.085, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_399']).astype(dtype=self.dtype),
        pd.DataFrame(
            [[516941, 517834, -1, 1.200000e-54, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_472']).astype(dtype=self.dtype),
        pd.DataFrame(
            [[1940269, 1941171, 1, 4.200000e-43, 'protein', 'Phage_integrase', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_1793']).astype(dtype=self.dtype),
        pd.DataFrame(
            [[1545830, 1546807, -1, 1.100000e-21, 'protein', 'intersection_tyr_intI', np.nan, 'intI']],
            columns=self.columns,
            index=['OBAL001.B.00005.C001_1416']).astype(dtype=self.dtype),
    ]

    exp_attC = [
        pd.DataFrame(
            [[421689, 421764, 1, 0.13, 'attC', 'attc_4', np.nan, 'attC']],
            columns=self.columns,
            index=['attc_001']).astype(dtype=self.dtype),
        pd.DataFrame(
            [[442458, 442514, -1, 7.000000e-07, 'attC', 'attc_4', np.nan, 'attC']],
            columns=self.columns,
            index=['attc_001']).astype(dtype=self.dtype),
        empty,
        empty,
        pd.DataFrame(
            [[1547800, 1547859, 1, 0.00049, 'attC', 'attc_4', np.nan, 'attC'],
             [1548775, 1548834, 1, 0.00009, 'attC', 'attc_4', 916.0, 'attC']],
            columns=self.columns,
            index=['attc_001', 'attc_002']).astype(dtype=self.dtype),
    ]

    for i, integron in enumerate(integrons):
        self.assertEqual(integron.replicon.name, replicon_id)
        pdt.assert_frame_equal(integron.integrase, exp_int[i])
        pdt.assert_frame_equal(integron.attC, exp_attC[i])
        pdt.assert_frame_equal(integron.promoter, empty)
        pdt.assert_frame_equal(integron.attI, empty)
        pdt.assert_frame_equal(integron.proteins, empty)
def test_find_integron_proteins_lin_replicon(self):
    # Runs find_integron on a replicon loaded with linear topology and checks
    # the log summary, the two integrons returned, and the integrase / attC /
    # promoter / attI / proteins frames of each.
    replicon_name = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    prot_file = self.find_data(
        os.path.join('Proteins', replicon_id + '.prt'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    exp_result_dir = 'Results_Integron_Finder_acba.007.p01.13.linear'
    attc_file = self.find_data(
        os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                     '{}_attc_table.res'.format(replicon.id)))
    intI_file = self.find_data(
        os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                     '{}_intI.res'.format(replicon.id)))
    phageI_file = self.find_data(
        os.path.join(exp_result_dir, 'tmp_{}'.format(replicon.id),
                     '{}_phage_int.res'.format(replicon.id)))

    # A single namespace is enough: the previous version filled a first
    # Namespace and then discarded it by rebinding `args` to a new one.
    args = argparse.Namespace()
    args.evalue_attc = 1.
    args.max_attc_size = 200
    args.min_attc_size = 40
    args.distance_threshold = 4000  # (4kb at least between 2 different arrays)
    args.attc_model = 'attc_4.cm'
    args.no_proteins = False
    args.gembase = False  # needed by read_hmm which is called when no_proteins == False
    args.union_integrases = False
    args.keep_palindromes = True
    args.calin_threshold = 2
    args.local_max = False
    cfg = Config(args)
    cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')
    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

    # NOTE(review): literal reproduced exactly as it appears in this view;
    # confirm its whitespace against what the logger actually emits.
    exp_msg = """In replicon {}, there are: - 0 complete integron(s) found with a total 0 attC site(s) - 1 CALIN element(s) found with a total of 3 attC site(s) - 1 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
    with self.catch_log() as log:
        integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                  phageI_file, cfg)
        catch_msg = log.get_value().strip()
    self.assertEqual(catch_msg, exp_msg)

    self.assertEqual(len(integrons), 2)

    # Expected integrase frames: one hit for the first integron, none for
    # the second.
    exp_int = []
    exp = pd.DataFrame({'annotation': 'intI',
                        'distance_2attC': np.nan,
                        'evalue': 1.900000e-25,
                        'model': 'intersection_tyr_intI',
                        'pos_beg': 55,
                        'pos_end': 1014,
                        'strand': 1,
                        'type_elt': 'protein'},
                       columns=self.columns,
                       index=['ACBA.007.P01_13_1'])
    exp = exp.astype(dtype=self.dtype)
    exp_int.append(exp)
    exp_int.append(pd.DataFrame(columns=self.columns).astype(dtype=self.dtype))

    # Expected attC frames: none for the first integron, three sites for
    # the second.
    exp_attC = [pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)]
    exp = pd.DataFrame({'annotation': ['attC'] * 3,
                        'distance_2attC': [np.nan, 1196.0, 469.0],
                        'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                        'model': ['attc_4'] * 3,
                        'pos_beg': [17825, 19080, 19618],
                        'pos_end': [17884, 19149, 19726],
                        'strand': [-1, -1, -1],
                        'type_elt': 'attC'},
                       columns=self.columns,
                       index=['attc_001', 'attc_002', 'attc_003'])
    exp = exp.astype(dtype=self.dtype)
    exp_attC.append(exp)

    empty = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

    for i, integron in enumerate(integrons):
        self.assertEqual(integron.replicon.name, replicon_id)
        pdt.assert_frame_equal(integron.integrase, exp_int[i])
        pdt.assert_frame_equal(integron.attC, exp_attC[i])
        pdt.assert_frame_equal(integron.promoter, empty)
        pdt.assert_frame_equal(integron.attI, empty)
        pdt.assert_frame_equal(integron.proteins, empty)