def test_annot_in0(self):
    """
    Run func_annot on an In0 integron (an integrase and nothing else).

    The integron has no protein to annotate. A leftover
    ``<replicon id>_subseqprot.tmp`` file is planted in the working
    directory beforehand; after the call it must no longer be among the
    files found there, and the integron's protein table must be unchanged.
    """
    # Plant an empty leftover temp file, as if a previous run had not
    # cleaned up after itself.
    stale_tmp = os.path.join(self.tmp_dir,
                             "{}_subseqprot.tmp".format(self.replicon.id))
    with open(stale_tmp, "w"):
        pass

    # Integron holding a single integrase.
    in0 = Integron(self.replicon, self.cfg)
    in0.add_integrase(55, 1014, "ACBA.007.P01_13_1", 1, 1.9e-25,
                      "intersection_tyr_intI")

    # Expected protein table: typed columns, zero rows.
    prot_columns = ["pos_beg", "pos_end", "strand", "evalue",
                    "type_elt", "model", "distance_2attC", "annotation"]
    prot_types = {"pos_beg": "int",
                  "pos_end": "int",
                  "strand": "int",
                  "evalue": "float",
                  "type_elt": "str",
                  "model": "str",
                  "distance_2attC": "float",
                  "annotation": "str"}
    no_proteins = pd.DataFrame(columns=prot_columns).astype(dtype=prot_types)
    pdt.assert_frame_equal(no_proteins, in0.proteins)

    func_annot([in0], self.replicon, self.prot_db, self.hmm_files,
               self.cfg, self.tmp_dir)

    # Only the four integrase search outputs may remain in the directory.
    suffixes = ("_intI_table.res", "_phage_int_table.res",
                "_intI.res", "_phage_int.res")
    expected_paths = set(
        os.path.join(self.tmp_dir, "{}{}".format(self.replicon.id, suffix))
        for suffix in suffixes
    )
    found_paths = set(
        path for path in glob.glob(os.path.join(self.tmp_dir, "*"))
        if os.path.isfile(path)
    )
    self.assertEqual(expected_paths, found_paths)

    # Annotation must not have touched the (empty) protein table.
    pdt.assert_frame_equal(no_proteins, in0.proteins)
def test_find_attc_max_In0(self):
    """
    find_attc_max on an integron that carries only an integrase.

    The expected result is an empty frame with the ``self.max_cols``
    columns cast to ``self.max_dtype``.
    """
    replicon_name = 'ESCO001.B.00018.P002'
    fasta_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = Topology('circ')
        replicon = next(sequences_db)

    # One-row integrase table, typed like the other element tables.
    integrase_row = {'pos_beg': [90229],
                     'pos_end': [91242],
                     'strand': -1,
                     'evalue': 1.400000e-24,
                     'type_elt': 'protein',
                     'annotation': 'intI',
                     'model': 'intersection_tyr_intI',
                     'distance_2attC': np.nan}
    integrase = pd.DataFrame(integrase_row,
                             index=['ESCO001.B.00018.P002_106'],
                             columns=self.columns).astype(dtype=self.dtype)

    integron = Integron(replicon, self.cfg)
    integron.integrase = integrase

    max_final = find_attc_max([integron],
                              replicon,
                              self.cfg.distance_threshold,
                              self.cfg.model_attc_path,
                              self.cfg.max_attc_size,
                              self.cfg.min_attc_size,
                              circular=True,
                              out_dir=self.tmp_dir)

    expected = pd.DataFrame(columns=self.max_cols).astype(dtype=self.max_dtype)
    pdt.assert_frame_equal(max_final, expected)
def test_annot_calin_empty(self):
    """
    Run func_annot on a CALIN (attC sites, no integrase) without proteins.

    Nothing can be annotated: the protein table must be empty both before
    and after the call, and only the integrase search outputs are expected
    in the working directory.
    """
    calin = Integron(self.replicon, self.cfg)
    # attC sites only; no integrase and no protein is added.
    for beg, end, evalue in ((17825, 17884, 7e-9),
                             (19080, 19149, 7e-4),
                             (19618, 19726, 7e-7)):
        calin.add_attC(beg, end, -1, evalue, "attc_4")

    # Expected protein table: typed columns, zero rows.
    prot_columns = ["pos_beg", "pos_end", "strand", "evalue",
                    "type_elt", "model", "distance_2attC", "annotation"]
    prot_types = {"pos_beg": "int",
                  "pos_end": "int",
                  "strand": "int",
                  "evalue": "float",
                  "type_elt": "str",
                  "model": "str",
                  "distance_2attC": "float",
                  "annotation": "str"}
    no_proteins = pd.DataFrame(columns=prot_columns).astype(dtype=prot_types)
    pdt.assert_frame_equal(no_proteins, calin.proteins)

    func_annot([calin], self.replicon, self.prot_db, self.hmm_files,
               self.cfg, self.tmp_dir)

    # Compare the plain files present in the directory with the expected set.
    suffixes = ("_intI_table.res", "_phage_int_table.res",
                "_intI.res", "_phage_int.res")
    expected_paths = set(
        os.path.join(self.tmp_dir, "{}{}".format(self.replicon.id, suffix))
        for suffix in suffixes
    )
    found_paths = set(
        path for path in glob.glob(os.path.join(self.tmp_dir, "*"))
        if os.path.isfile(path)
    )
    self.assertEqual(expected_paths, found_paths)

    # Still no protein after annotation.
    pdt.assert_frame_equal(no_proteins, calin.proteins)
def test_annot_calin(self):
    """
    Run func_annot on a CALIN (attC sites, no integrase) with 4 proteins.

    Three of the four proteins are expected to receive a resfam
    annotation; the remaining one keeps its default values.
    """
    calin = Integron(self.replicon, self.cfg)
    # attC sites only (no integrase).
    for beg, end, evalue in ((17825, 17884, 7e-9),
                             (19080, 19149, 7e-4),
                             (19618, 19726, 7e-7)):
        calin.add_attC(beg, end, -1, evalue, "attc_4")
    # Pick up the proteins located around the attC sites.
    calin.add_proteins(self.prot_db)

    # Expected protein table before any annotation.
    prot_columns = ["pos_beg", "pos_end", "strand", "evalue",
                    "type_elt", "model", "distance_2attC", "annotation"]
    prot_ids = ["ACBA.007.P01_13_2{}".format(i) for i in range(4)]
    expected = pd.DataFrame({"pos_beg": [17375, 17886, 19090, 19721],
                             "pos_end": [17722, 18665, 19749, 20254],
                             "strand": [-1] * 4,
                             "evalue": [np.nan] * 4,
                             "type_elt": ["protein"] * 4,
                             "model": ["NA"] * 4,
                             "distance_2attC": [np.nan] * 4,
                             "annotation": ["protein"] * 4},
                            index=prot_ids)
    expected = expected[prot_columns]

    # The protein file is parsed and indexed through biopython, so the row
    # order is not guaranteed: compare the frames sorted by index.
    pdt.assert_frame_equal(expected.sort_index(), calin.proteins.sort_index())

    func_annot([calin], self.replicon, self.prot_db, self.hmm_files,
               self.cfg, self.tmp_dir)

    # Files left in the working directory.
    found_paths = set(
        path for path in glob.glob(os.path.join(self.tmp_dir, "*"))
        if os.path.isfile(path)
    )
    self.assertEqual(set(self.exp_files), found_paths)

    # Rows expected to change once annotated (ACBA.007.P01_13_22 does not).
    annotated_rows = (
        ("ACBA.007.P01_13_20",
         [17375, 17722, -1, 4.5e-31, "protein", "RF0066", np.nan, "emrE"]),
        ("ACBA.007.P01_13_21",
         [17886, 18665, -1, 7.4e-168, "protein", "RF0027", np.nan, "ANT3"]),
        ("ACBA.007.P01_13_23",
         [19721, 20254, -1, 6.2e-110, "protein", "RF0003", np.nan, "AAC3-I"]),
    )
    for prot_id, row in annotated_rows:
        expected.loc[prot_id] = row

    # Same ordering caveat as above: sort both sides before comparing.
    pdt.assert_frame_equal(expected.sort_index(), calin.proteins.sort_index())
def test_annot_wrong_hmmsearch(self):
    """
    Test that when the given HMMSEARCH command does not exist, it raises
    an exception specifying that the given command could not run.
    """
    self.cfg._args.hmmsearch = "nimportnaoik"
    # Create integron
    # NOTE(review): the sibling tests build the integron from self.replicon,
    # not self.replicon.name -- confirm which one is intended here.
    integron1 = Integron(self.replicon.name, self.cfg)
    integrons = [integron1]
    # Add only attc sites (no integrase)
    integron1.add_attC(17825, 17884, -1, 7e-9, "attc_4")
    integron1.add_attC(19080, 19149, -1, 7e-4, "attc_4")
    integron1.add_attC(19618, 19726, -1, 7e-7, "attc_4")
    # Add proteins between attC sites
    integron1.add_proteins(self.prot_db)
    # Annotate proteins: the bogus command must make func_annot fail
    with self.assertRaises(RuntimeError) as ctx:
        func_annot(integrons, self.replicon, self.prot_db, self.hmm_files,
                   self.cfg, self.tmp_dir)
    # Raw string: "\[" and "\]" are regex escapes, not string escapes, and a
    # non-raw literal triggers an invalid-escape-sequence warning.
    self.assertTrue(
        re.search(r"failed : \[Errno 2\] No such file or directory: 'nimportnaoik'",
                  str(ctx.exception)))
def test_annot_wrong_hmm(self):
    """
    func_annot must fail when the hmm profile file does not exist.

    The call is expected to raise a RuntimeError whose message ends with
    " failed return code = 1".
    """
    missing_profiles = ["myhmm.hmm"]

    calin = Integron(self.replicon, self.cfg)
    # attC sites only (no integrase).
    for beg, end, evalue in ((17825, 17884, 7e-9),
                             (19080, 19149, 7e-4),
                             (19618, 19726, 7e-7)):
        calin.add_attC(beg, end, -1, evalue, "attc_4")
    # Proteins to annotate.
    calin.add_proteins(self.prot_db)

    with self.assertRaises(RuntimeError) as ctx:
        func_annot([calin], self.replicon, self.prot_db, missing_profiles,
                   self.cfg, self.tmp_dir)

    message = str(ctx.exception)
    self.assertTrue(message.endswith(" failed return code = 1"))
def test_add_integrase(self):
    """
    Integron.add_integrase fills the integrase table, and only once.

    The first call must produce the expected one-row frame; a second call
    on the same integron must raise a RuntimeError with the message
    "add_integrase should be called once.".
    """
    replicon_name = "acba.007.p01.13"
    fasta_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = Topology('lin')
        replicon = next(sequences_db)

    id_int = "ACBA.007.P01_13_1"
    data_integrase = {"pos_beg": 55,
                      "pos_end": 1014,
                      "strand": 1,
                      "evalue": 1.900000e-25,
                      "type_elt": "protein",
                      "annotation": "intI",
                      "model": "intersection_tyr_intI",
                      "distance_2attC": np.nan}
    expected = pd.DataFrame(data_integrase, columns=self.columns, index=[id_int])
    expected = expected.astype(dtype=self.dtype)

    # Positional arguments handed to add_integrase, reused for both calls.
    call_args = (data_integrase["pos_beg"],
                 data_integrase["pos_end"],
                 id_int,
                 data_integrase["strand"],
                 data_integrase["evalue"],
                 data_integrase["model"])

    integron = Integron(replicon, self.cfg)
    integron.add_integrase(*call_args)
    pdt.assert_frame_equal(expected, integron.integrase)

    # A second registration on the same integron is rejected.
    with self.assertRaises(RuntimeError) as ctx:
        integron.add_integrase(*call_args)
    self.assertEqual(str(ctx.exception), "add_integrase should be called once.")
def test_integrons_report(self):
    """
    results.integrons_report on one hand-built integron.

    The integron is given an integrase, three attC sites, one promoter and
    four proteins; the report is compared with the reference
    ``<replicon_name>.integrons`` file from the test data, after casting
    that file's columns to the element dtypes.
    """
    replicon_name = "acba.007.p01.13"
    replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('circ')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    args = argparse.Namespace()
    cfg = Config(args)
    # The original assigned eagle_eyes twice in a row; once is enough.
    cfg._args.eagle_eyes = False
    cfg._args.local_max = False
    integron = Integron(replicon, cfg)

    columns = ['pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model',
               'distance_2attC', 'annotation']
    dtype = {"pos_beg": 'int',
             "pos_end": 'int',
             "strand": 'int',
             "evalue": 'float',
             "type_elt": 'str',
             "annotation": 'str',
             "model": 'str',
             "distance_2attC": 'float'}

    data_integrase = {"pos_beg": 55,
                      "pos_end": 1014,
                      "strand": 1,
                      "evalue": 1.900000e-25,
                      "type_elt": "protein",
                      "annotation": "intI",
                      "model": "intersection_tyr_intI",
                      "distance_2attC": np.nan}
    id_int = "ACBA.007.P01_13_1"
    integrase = pd.DataFrame(data_integrase, columns=columns, index=[id_int])
    integrase = integrase.astype(dtype=dtype)

    data_attc = {"pos_beg": [17825, 19080, 19618],
                 "pos_end": [17884, 19149, 19726],
                 "strand": [-1] * 3,
                 "evalue": [1.000000e-09, 1.000000e-04, 1.100000e-07],
                 "type_elt": ["attC"] * 3,
                 "annotation": ["attC"] * 3,
                 "model": ["attc_4"] * 3,
                 "distance_2attC": [np.nan, 1196.0, 469.0]}
    attC = pd.DataFrame(data_attc,
                        columns=columns,
                        index=['attc_00{}'.format(i) for i in range(1, 4)])
    attC = attC.astype(dtype=dtype)

    promoter = pd.DataFrame({'pos_beg': 25,
                             'pos_end': 51,
                             'strand': -1,
                             'evalue': np.nan,
                             'type_elt': 'Promoter',
                             'annotation': 'Pc_1',
                             'model': np.nan,
                             'distance_2attC': np.nan},
                            index=['Pc_int1'],
                            columns=columns)
    promoter = promoter.astype(dtype=dtype)

    proteins = pd.DataFrame({'pos_beg': [17375, 17886, 19090, 19721],
                             'pos_end': [17722, 18665, 19749, 20254],
                             'strand': [-1] * 4,
                             'evalue': [np.nan] * 4,
                             'type_elt': ['protein'] * 4,
                             'annotation': ['protein'] * 4,
                             'model': [np.nan] * 4,
                             'distance_2attC': [np.nan] * 4},
                            index=['ACBA.007.P01_13_2{}'.format(i) for i in range(0, 4)],
                            columns=columns)
    proteins = proteins.astype(dtype=dtype)

    # Inject the element tables directly instead of going through add_*().
    integron.integrase = integrase
    integron.attC = attC
    integron.promoter = promoter
    integron.proteins = proteins

    report = results.integrons_report([integron])

    exp_report = pd.read_csv(
        self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                    '{}.integrons'.format(replicon_name))),
        sep="\t")
    exp_report = exp_report.astype(dtype=dtype)
    pdt.assert_frame_equal(exp_report, report)
def setUp(self):
    """
    Build the fixtures shared by the tests.

    Sets the integron_finder home (from the INTEGRON_HOME environment
    variable when present, else two levels above this file), recreates an
    empty temporary directory, builds a Config, loads the
    OBAL001.B.00005.C001 replicon with a linear topology, creates an
    Integron on it, and stores the column names / dtypes used to build
    expected frames.
    """
    here = os.path.dirname(__file__)

    self.local_install = 'INTEGRON_HOME' in os.environ
    if self.local_install:
        self.integron_home = os.environ['INTEGRON_HOME']
    else:
        two_levels_up = os.path.join(here, '..', '..')
        self.integron_home = os.path.normpath(os.path.abspath(two_levels_up))

    # Start every test from a fresh, empty working directory.
    self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
    if os.path.isdir(self.tmp_dir):
        shutil.rmtree(self.tmp_dir)
    os.makedirs(self.tmp_dir)

    args = argparse.Namespace(
        attc_model='attc_4.cm',
        max_attc_size=200,
        min_attc_size=40,
        distance_threshold=4000,  # (4kb at least between 2 different arrays)
        eagle_eyes=False,
        local_max=False,
    )
    self.cfg = Config(args)
    self.cfg._prefix_data = os.path.join(here, 'data')

    # Reference replicon, read with a linear topology.
    replicon_name = 'OBAL001.B.00005.C001'
    fasta_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = Topology('lin')
        self.replicon = next(sequences_db)

    self.integron = Integron(self.replicon, self.cfg)

    # Layout of the per-element tables (integrase, attC, promoter, ...).
    self.columns = ['pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt',
                    'model', 'distance_2attC', 'annotation']
    self.dtype = {"pos_beg": 'int',
                  "pos_end": 'int',
                  "strand": 'int',
                  "evalue": 'float',
                  "type_elt": 'str',
                  "annotation": 'str',
                  "model": 'str',
                  "distance_2attC": 'float'}

    # Layout of the frame the tests expect back from find_attc_max.
    self.max_cols = ['Accession_number', 'cm_attC', 'cm_debut', 'cm_fin',
                     'pos_beg', 'pos_end', 'sens', 'evalue']
    self.max_dtype = {'Accession_number': 'str',
                      'cm_attC': 'str',
                      'cm_debut': 'int',
                      'cm_fin': 'int',
                      'pos_beg': 'int',
                      'pos_end': 'int',
                      'sens': 'str',
                      'evalue': 'float'}
def test_integrons_report(self):
    """
    results.integrons_report on one hand-built integron.

    The integron is given an integrase, three attC sites, one promoter and
    four proteins; the report is compared with the reference
    ``<replicon_name>.integrons`` file from the test data, after casting
    that file's columns to the element dtypes.
    """
    replicon_name = "acba.007.p01.13"
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('circ')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    args = argparse.Namespace()
    cfg = Config(args)
    # The original assigned eagle_eyes twice in a row; once is enough.
    cfg._args.eagle_eyes = False
    cfg._args.local_max = False
    integron = Integron(replicon, cfg)

    columns = [
        'pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model',
        'distance_2attC', 'annotation'
    ]
    dtype = {
        "pos_beg": 'int',
        "pos_end": 'int',
        "strand": 'int',
        "evalue": 'float',
        "type_elt": 'str',
        "annotation": 'str',
        "model": 'str',
        "distance_2attC": 'float'
    }

    data_integrase = {
        "pos_beg": 55,
        "pos_end": 1014,
        "strand": 1,
        "evalue": 1.900000e-25,
        "type_elt": "protein",
        "annotation": "intI",
        "model": "intersection_tyr_intI",
        "distance_2attC": np.nan
    }
    id_int = "ACBA.007.P01_13_1"
    integrase = pd.DataFrame(data_integrase, columns=columns, index=[id_int])
    integrase = integrase.astype(dtype=dtype)

    data_attc = {
        "pos_beg": [17825, 19080, 19618],
        "pos_end": [17884, 19149, 19726],
        "strand": [-1] * 3,
        "evalue": [1.000000e-09, 1.000000e-04, 1.100000e-07],
        "type_elt": ["attC"] * 3,
        "annotation": ["attC"] * 3,
        "model": ["attc_4"] * 3,
        "distance_2attC": [np.nan, 1196.0, 469.0]
    }
    attC = pd.DataFrame(data_attc,
                        columns=columns,
                        index=['attc_00{}'.format(i) for i in range(1, 4)])
    attC = attC.astype(dtype=dtype)

    promoter = pd.DataFrame(
        {
            'pos_beg': 25,
            'pos_end': 51,
            'strand': -1,
            'evalue': np.nan,
            'type_elt': 'Promoter',
            'annotation': 'Pc_1',
            'model': np.nan,
            'distance_2attC': np.nan
        },
        index=['Pc_int1'],
        columns=columns)
    promoter = promoter.astype(dtype=dtype)

    proteins = pd.DataFrame(
        {
            'pos_beg': [17375, 17886, 19090, 19721],
            'pos_end': [17722, 18665, 19749, 20254],
            'strand': [-1] * 4,
            'evalue': [np.nan] * 4,
            'type_elt': ['protein'] * 4,
            'annotation': ['protein'] * 4,
            'model': [np.nan] * 4,
            'distance_2attC': [np.nan] * 4
        },
        index=['ACBA.007.P01_13_2{}'.format(i) for i in range(0, 4)],
        columns=columns)
    proteins = proteins.astype(dtype=dtype)

    # Inject the element tables directly instead of going through add_*().
    integron.integrase = integrase
    integron.attC = attC
    integron.promoter = promoter
    integron.proteins = proteins

    report = results.integrons_report([integron])

    exp_report = pd.read_csv(self.find_data(
        os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                     '{}.integrons'.format(replicon_name))),
                             sep="\t")
    exp_report = exp_report.astype(dtype=dtype)
    pdt.assert_frame_equal(exp_report, report)
def test_describe(self):
    """
    Integron.describe on an integron holding one element of each kind.

    The expected description is the concatenation of the integrase, attC,
    promoter, attI and protein tables, with the index turned into an
    "element" column and the columns "type", "ID_replicon", "ID_integron",
    "default" and "considered_topology" appended.
    """
    replicon_name = "acba.007.p01.13"
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    args = argparse.Namespace()
    args.eagle_eyes = False
    args.local_max = False
    cfg = Config(args)
    integron = Integron(replicon, cfg)

    data_integrase = {
        "pos_beg": 55,
        "pos_end": 1014,
        "strand": 1,
        "evalue": 1.900000e-25,
        "type_elt": "protein",
        "annotation": "intI",
        "model": "intersection_tyr_intI",
        "distance_2attC": np.nan
    }
    id_int = "ACBA.007.P01_13_1"
    integrase = pd.DataFrame(data_integrase,
                             columns=self.columns,
                             index=[id_int])
    integrase = integrase.astype(dtype=self.dtype)

    # The same row content is reused for attC, promoter, attI and protein;
    # only the index label differs between those four tables.
    data_attc = {
        "pos_beg": 10,
        "pos_end": 100,
        "strand": -1,
        "evalue": 1.1e-07,
        "type_elt": "attC",
        "annotation": "attC",
        "model": "attc_4",
        "distance_2attC": np.nan
    }
    attC = pd.DataFrame(data_attc, columns=self.columns, index=['attc_001'])
    attC = attC.astype(dtype=self.dtype)
    promoter = pd.DataFrame(data_attc, columns=self.columns, index=['prom_001'])
    promoter = promoter.astype(dtype=self.dtype)
    attI = pd.DataFrame(data_attc, columns=self.columns, index=['attI_001'])
    attI = attI.astype(dtype=self.dtype)
    proteins = pd.DataFrame(data_attc, columns=self.columns, index=['prot_001'])
    proteins = proteins.astype(dtype=self.dtype)

    # Build the expected description by hand.
    exp_description = pd.concat([integrase, attC, promoter, attI, proteins],
                                ignore_index=False)
    exp_description = exp_description.reset_index()
    exp_description.columns = ["element"] + list(exp_description.columns[1:])
    exp_description["type"] = "complete"
    exp_description["ID_replicon"] = replicon.id
    exp_description["ID_integron"] = id(integron)  # uniq identifier of a given Integron
    exp_description["default"] = "Yes"
    exp_description["considered_topology"] = replicon.topology
    exp_description.drop_duplicates(subset=["element"], inplace=True)

    # The original repeated this assignment twice; once is enough.
    # NOTE(review): this sets the flag on self.cfg, while the integron above
    # was built with the local cfg -- confirm which config was meant.
    self.cfg._args.eagle_eyes = False

    integron.integrase = integrase
    integron.attC = attC
    integron.promoter = promoter
    integron.attI = attI
    integron.proteins = proteins

    received_description = integron.describe()
    pdt.assert_frame_equal(received_description, exp_description)
def test_add_proteins(self):
    """
    Integron.add_proteins with a ProdigalDB built from a short protein file.

    Seven attC sites are set on the integron; four proteins
    (PSSU.001.C01_13_2815 to _2818) are expected in the resulting table.
    """
    replicon_name = 'pssu.001.c01.13'
    fasta_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = Topology('lin')
        replicon = next(sequences_db)

    prot_file = os.path.join(self._data_dir,
                             '{}.prt.short'.format(replicon_name))

    args = argparse.Namespace()
    args.gembase = False
    args.annot_parser_name = None
    cfg = Config(args)
    integron = Integron(replicon, cfg)

    # attC table injected directly on the integron.
    attc_beg = [3072863, 3073496, 3074121, 3075059, 3075593, 3076281, 3076659]
    attc_end = [3072931, 3073555, 3074232, 3075118, 3075652, 3076340, 3076718]
    data_attc = {
        "pos_beg": attc_beg,
        "pos_end": attc_end,
        "strand": [-1] * 7,
        "evalue": [2.5e-06, 7e-08, 6.5e-08, 3.2e-06, 4.1e-07, 1.4e-08, 4e-08],
        "type_elt": ['attC'] * 7,
        "annotation": ['attC'] * 7,
        "model": ['attc_4'] * 7,
        "distance_2attC": [np.nan, 565.0, 566.0, 827.0, 475.0, 629.0, 319.0],
    }
    attc_ids = ['attc_00{}'.format(i)
                for i in range(len(data_attc['pos_beg']))]
    integron.attC = pd.DataFrame(
        data_attc, columns=self.columns, index=attc_ids
    ).astype(dtype=self.dtype)

    prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)
    integron.add_proteins(prot_db)

    expected_ids = ['PSSU.001.C01_13_281{}'.format(i) for i in range(5, 9)]
    exp_proteins = pd.DataFrame(
        {
            'pos_beg': [3071974, 3072950, 3074243, 3076720],
            'pos_end': [3072855, 3073468, 3075055, 3077511],
            'strand': [-1] * 4,
            'evalue': [np.nan] * 4,
            'type_elt': ['protein'] * 4,
            'annotation': ['protein'] * 4,
            'model': ['NA'] * 4,
            'distance_2attC': [np.nan] * 4,
        },
        index=expected_ids,
        columns=self.columns,
    ).astype(dtype=self.dtype)

    # Compare sorted by index so the row order does not matter.
    pdt.assert_frame_equal(exp_proteins.sort_index(),
                           integron.proteins.sort_index())
def test_attI(self):
    """
    Integron.add_attI in three configurations.

    - attC sites and integrase: one attI ('attI1', 109330-109388) expected;
    - attC sites only: an empty attI table expected;
    - integrase only: the same attI as in the first case expected.
    """
    replicon_name = 'saen.040.p01.10'
    fasta_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    with FastaIterator(fasta_path) as sequences_db:
        sequences_db.topologies = Topology('lin')
        replicon = next(sequences_db)

    attc_data = {
        'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
        'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
        'strand': [-1] * 6,
        'evalue': [3.400000e-06, 7.500000e-09, 6.800000e-06,
                   2.800000e-07, 6.600000e-06, 1.800000e-04],
        'type_elt': ['attC'] * 6,
        'annotation': ['attC'] * 6,
        'model': ['attc_4'] * 6,
        'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0],
    }
    attC = pd.DataFrame(attc_data,
                        index=['attc_00{}'.format(i) for i in range(1, 7)],
                        columns=self.columns).astype(dtype=self.dtype)

    integrase_data = {
        'pos_beg': 109469,
        'pos_end': 110482,
        'strand': 1,
        'evalue': 1.600000e-24,
        'type_elt': 'protein',
        'annotation': 'intI',
        'model': 'intersection_tyr_intI',
        'distance_2attC': np.nan,
    }
    integrase = pd.DataFrame(integrase_data,
                             index=['SAEN.040.P01_10_135'],
                             columns=self.columns).astype(dtype=self.dtype)

    # ------------------------------------ #
    # attI search with attC and integrase  #
    # ------------------------------------ #
    integron = Integron(replicon, self.cfg)
    integron.attC = attC
    integron.integrase = integrase

    exp_attI = pd.DataFrame(
        {
            'pos_beg': [109330],
            'pos_end': [109388],
            'strand': [-1],
            'evalue': [np.nan],
            'type_elt': 'attI',
            'annotation': 'attI_1',
            'model': 'NA',
            'distance_2attC': [np.nan],
        },
        index=['attI1'],
        columns=self.columns,
    ).astype(dtype=self.dtype)

    integron.add_attI()
    pdt.assert_frame_equal(exp_attI, integron.attI)

    # --------------------------------------- #
    # attI search with attC but no integrase  #
    # --------------------------------------- #
    integron = Integron(replicon, self.cfg)
    integron.attC = attC

    empty_attI = pd.DataFrame(columns=self.columns).astype(dtype=self.dtype)

    integron.add_attI()
    pdt.assert_frame_equal(empty_attI, integron.attI)

    # --------------------------------------- #
    # attI search with integrase but no attC  #
    # --------------------------------------- #
    integron = Integron(replicon, self.cfg)
    integron.integrase = integrase

    integron.add_attI()
    pdt.assert_frame_equal(exp_attI, integron.attI)
def test_add_promoter(self):
    """
    Integron.add_promoter in three configurations.

    - attC sites and integrase: promoters 'P_intI1' and 'Pc_int1' expected;
    - attC sites only: an empty promoter table expected;
    - integrase only: the same two promoters as in the first case expected.
    """
    replicon_name = 'saen.040.p01.10'
    replicon_path = self.find_data(
        os.path.join('Replicons', replicon_name + '.fst'))
    topologies = Topology('lin')
    with FastaIterator(replicon_path) as sequences_db:
        sequences_db.topologies = topologies
        replicon = next(sequences_db)

    # The attC and integrase tables are prepared first because they are
    # set on the integron before add_promoter is called.
    attC = pd.DataFrame(
        {
            'pos_beg': [104651, 105162, 106018, 107567, 108423, 108743],
            'pos_end': [104710, 105221, 106087, 107626, 108482, 108832],
            'strand': [-1] * 6,
            'evalue': [
                3.400000e-06, 7.500000e-09, 6.800000e-06, 2.800000e-07,
                6.600000e-06, 1.800000e-04
            ],
            'type_elt': ['attC'] * 6,
            'annotation': ['attC'] * 6,
            'model': ['attc_4'] * 6,
            'distance_2attC': [np.nan, 452.0, 797.0, 1480.0, 797.0, 261.0]
        },
        index=['attc_00{}'.format(i) for i in range(1, 7)],
        columns=self.columns)
    attC = attC.astype(dtype=self.dtype)

    integrase = pd.DataFrame(
        {
            'pos_beg': 109469,
            'pos_end': 110482,
            'strand': 1,
            'evalue': 1.600000e-24,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan
        },
        index=['SAEN.040.P01_10_135'],
        columns=self.columns)
    integrase = integrase.astype(dtype=self.dtype)

    ##########################################
    # test promoter with attC with integrase #
    ##########################################
    integron = Integron(replicon, self.cfg)
    integron.attC = attC
    integron.integrase = integrase
    integron.add_promoter()

    exp_promoters = pd.DataFrame(
        {
            'pos_beg': [109413, 109439],
            'pos_end': [109447, 109465],
            'strand': [1, -1],
            'evalue': [np.nan] * 2,
            'type_elt': ['Promoter'] * 2,
            'annotation': ['Pint_1', 'Pc_1'],
            'model': ['NA'] * 2,
            'distance_2attC': [np.nan] * 2
        },
        index=['P_intI1', 'Pc_int1'],
        columns=self.columns)
    exp_promoters = exp_promoters.astype(dtype=self.dtype)

    pdt.assert_frame_equal(exp_promoters, integron.promoter)

    #############################################
    # test promoter with attC without integrase #
    #############################################
    integron = Integron(replicon, self.cfg)
    integron.attC = attC
    integron.add_promoter()

    empty_promoter = pd.DataFrame(columns=self.columns)
    empty_promoter = empty_promoter.astype(dtype=self.dtype)

    pdt.assert_frame_equal(empty_promoter, integron.promoter)

    #############################################
    # test promoter without attC with integrase #
    #############################################
    integron = Integron(replicon, self.cfg)
    integron.integrase = integrase
    integron.add_promoter()

    pdt.assert_frame_equal(exp_promoters, integron.promoter)
def test_type(self):
    """
    Integron.type for the four element combinations.

    - nothing added: None;
    - integrase only: "In0";
    - attC only: "CALIN";
    - integrase and attC: "complete".
    """
    def new_integron(replicon_id):
        # Integron on an empty sequence record carrying the given id.
        return Integron(SeqRecord(Seq.Seq(''), id=replicon_id), self.cfg)

    no_integrase = new_integron('foo')
    self.assertIsNone(no_integrase.type())

    just_one_integrase = new_integron('just_one_integrase')
    just_one_integrase.add_integrase(10, 100, 'foo', 1, 1e-2,
                                     "intersection_tyr_intI")
    self.assertEqual(just_one_integrase.type(), "In0")

    just_one_attC = new_integron('just_one_attC')
    just_one_attC.add_attC(10, 100, 1, 1e-2, "intersection_tyr_intI")
    self.assertEqual(just_one_attC.type(), "CALIN")

    one_integrase_one_attC = new_integron('one_integrase_one_attC')
    one_integrase_one_attC.add_integrase(10, 100, 'foo', 1, 1e-2,
                                         "intersection_tyr_intI")
    one_integrase_one_attC.add_attC(10, 100, 1, 1e-2,
                                    "intersection_tyr_intI")
    self.assertEqual(one_integrase_one_attC.type(), "complete")
def test_annot_multi(self):
    """
    Test func_annot when there are 4 integrons:
    - 1 calin with 4 proteins, 2 having a resfam annotation
    - 1 calin with 2 proteins, none having a resfam annotation
    - 1 in0
    - 1 complete with 4 proteins, 3 having a resfam annotation

    func_annot is run three times (evalue=1e-32, default thresholds,
    coverage=0.4); each run is expected to add annotations on top of the
    previous one.
    """
    # resfam for: 16, 13, 3, 12
    prot_columns = ["pos_beg", "pos_end", "strand", "evalue",
                    "type_elt", "model", "distance_2attC", "annotation"]

    def unannotated(pos_beg, pos_end, strand, prot_ids):
        # Expected table for proteins that carry no annotation yet.
        count = len(prot_ids)
        frame = pd.DataFrame({"pos_beg": pos_beg,
                              "pos_end": pos_end,
                              "strand": strand,
                              "evalue": [np.nan] * count,
                              "type_elt": ["protein"] * count,
                              "model": ["NA"] * count,
                              "distance_2attC": [np.nan] * count,
                              "annotation": ["protein"] * count},
                             index=prot_ids)
        frame = frame[prot_columns]
        return frame.astype(dtype=self.prot_dtype)

    def check_all():
        # The protein file is parsed and indexed through biopython, so the
        # row order is not guaranteed: compare frames sorted by index.
        for inte, exp_prot in zip(integrons, expected_proteins):
            pdt.assert_frame_equal(inte.proteins.sort_index(),
                                   exp_prot.sort_index())

    # In0: integrase only.
    # NOTE(review): built from self.replicon.name while the three others use
    # self.replicon -- confirm this difference is intended.
    integron1 = Integron(self.replicon.name, self.cfg)
    integron1.add_integrase(56, 1014, "ACBA.007.P01_13_1", 1, 1.9e-25,
                            "intersection_tyr_intI")

    # CALIN with resfam proteins.
    integron2 = Integron(self.replicon, self.cfg)
    for beg, end, evalue in ((7400, 7650, 7e-9),
                             (8600, 8650, 7e-4),
                             (10200, 10400, 7e-7),
                             (10800, 10900, 7e-7)):
        integron2.add_attC(beg, end, -1, evalue, "attc_4")
    integron2.add_proteins(self.prot_db)

    # CALIN without any resfam protein.
    integron3 = Integron(self.replicon, self.cfg)
    integron3.add_attC(4320, 4400, -1, 7e-9, "attc_4")
    integron3.add_proteins(self.prot_db)

    # Complete integron.
    integron4 = Integron(self.replicon, self.cfg)
    for beg, end, evalue in ((17825, 17884, 7e-9),
                             (19080, 19149, 7e-4),
                             (19618, 19726, 7e-7)):
        integron4.add_attC(beg, end, -1, evalue, "attc_4")
    integron4.add_integrase(16542, 17381, "ACBA.007.P01_13_19", -1, 1.9e-25,
                            "intersection_tyr_intI")
    integron4.add_proteins(self.prot_db)

    integrons = [integron1, integron2, integron3, integron4]

    # Expected protein tables before annotation.
    proteins1 = pd.DataFrame(columns=prot_columns)
    proteins1 = proteins1.astype(dtype={"pos_beg": "int",
                                        "pos_end": "int",
                                        "strand": "int",
                                        "evalue": "float",
                                        "type_elt": "str",
                                        "model": "str",
                                        "distance_2attC": "float",
                                        "annotation": "str"})
    proteins1 = proteins1[prot_columns]
    proteins1 = proteins1.astype(dtype=self.prot_dtype)

    proteins2 = unannotated([7088, 7710, 8650, 10524],
                            [7351, 8594, 10125, 11699],
                            [1, -1, -1, -1],
                            ["ACBA.007.P01_13_11", "ACBA.007.P01_13_12",
                             "ACBA.007.P01_13_13", "ACBA.007.P01_13_14"])
    proteins3 = unannotated([3546, 4380],
                            [4313, 4721],
                            [1, 1],
                            ["ACBA.007.P01_13_6", "ACBA.007.P01_13_7"])
    proteins4 = unannotated([17375, 17886, 19090, 19721],
                            [17722, 18665, 19749, 20254],
                            [-1] * 4,
                            ["ACBA.007.P01_13_20", "ACBA.007.P01_13_21",
                             "ACBA.007.P01_13_22", "ACBA.007.P01_13_23"])

    # The list holds the frames themselves, so the row updates made below
    # are visible through it.
    expected_proteins = [proteins1, proteins2, proteins3, proteins4]
    check_all()

    # First pass: annotate with an explicit evalue threshold.
    func_annot(integrons, self.replicon, self.prot_db, self.hmm_files,
               self.cfg, self.tmp_dir, evalue=1e-32)

    # Files left in the working directory.
    found_paths = set(
        path for path in glob.glob(os.path.join(self.tmp_dir, "*"))
        if os.path.isfile(path)
    )
    self.assertEqual(set(self.exp_files), found_paths)

    proteins2.loc["ACBA.007.P01_13_13"] = [8650, 10125, -1, 2.4e-86,
                                           "protein", "RF0007", np.nan,
                                           "ABC_efflux"]
    proteins4.loc["ACBA.007.P01_13_21"] = [17886, 18665, -1, 7.4e-168,
                                           "protein", "RF0027", np.nan,
                                           "ANT3"]
    proteins4.loc["ACBA.007.P01_13_23"] = [19721, 20254, -1, 6.2e-110,
                                           "protein", "RF0003", np.nan,
                                           "AAC3-I"]
    check_all()

    # Second pass: default evalue (1 more annotation).
    with self.catch_io(out=True):
        func_annot(integrons, self.replicon, self.prot_db, self.hmm_files,
                   self.cfg, self.tmp_dir)
    proteins4.loc["ACBA.007.P01_13_20"] = [17375, 17722, -1, 4.5e-31,
                                           "protein", "RF0066", np.nan,
                                           "emrE"]
    check_all()

    # Third pass: lower coverage threshold (1 more annotation).
    with self.catch_io(out=True):
        func_annot(integrons, self.replicon, self.prot_db, self.hmm_files,
                   self.cfg, self.tmp_dir, coverage=0.4)
    proteins2.loc["ACBA.007.P01_13_12"] = [7710, 8594, -1, 1.6e-5,
                                           "protein", "RF0033", np.nan,
                                           "APH3"]
    check_all()