def test_search_attc_uniq(self):
    """
    Check that search_attc returns one single attC array when the input
    table holds 3 attC sites located on the same strand, each of them
    separated from the next one by less than 4kb.
    """
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    results_dir = "Results_Integron_Finder_" + self.rep_name
    infernal_path = os.path.join("tests", "data", results_dir, "other",
                                 self.rep_name + "_attc_table.res")

    # Build the attC dataframe from the infernal output file
    hits = integron_finder.read_infernal(infernal_path)
    # Look for attC arrays, palindromes being kept.
    # Two attC sites belong to the same array when they are on the same
    # strand and less than 4kb apart.
    arrays = integron_finder.search_attc(hits, True)
    self.assertEqual(len(arrays), 1)

    # Expected content of the unique array: (pos_beg, pos_end, evalue)
    expected_hits = [
        (17825, 17884, 1e-9),
        (19080, 19149, 1e-4),
        (19618, 19726, 1.1e-7),
    ]
    expected = pd.DataFrame(columns=columns, dtype='int')
    for pos_beg, pos_end, evalue in expected_hits:
        expected = expected.append({"Accession_number": self.rep_name,
                                    "cm_attC": "attC_4",
                                    "cm_debut": 1,
                                    "cm_fin": 47,
                                    "pos_beg": pos_beg,
                                    "pos_end": pos_end,
                                    "sens": "-",
                                    "evalue": evalue},
                                   ignore_index=True)

    # Position columns must be integers
    int_columns = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expected[int_columns] = expected[int_columns].astype(int)
    pdt.assert_frame_equal(expected, arrays[0])
def test_filter_evalue_thres(self):
    """
    Check that read_infernal, called with evalue=1e-8, returns only the
    hit of the attC table whose e-value is 1e-9.
    """
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    results_dir = "Results_Integron_Finder_" + self.rep_name
    infernal_path = os.path.join("tests", "data", results_dir, "other",
                                 self.rep_name + "_attc_table.res")
    found = integron_finder.read_infernal(infernal_path, evalue=1e-8)

    # The single hit expected in the filtered dataframe
    kept_hit = {
        "Accession_number": self.rep_name,
        "cm_attC": "attC_4",
        "cm_debut": 1,
        "cm_fin": 47,
        "pos_beg": 17825,
        "pos_end": 17884,
        "sens": "-",
        "evalue": 1e-9,
    }
    expected = pd.DataFrame(columns=columns)
    expected = expected.append(kept_hit, ignore_index=True)

    # Position columns must be integers
    int_columns = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expected[int_columns] = expected[int_columns].astype(int)
    pdt.assert_frame_equal(found, expected)
def test_no_total_cm_match_strandm(self):
    """
    Check that the start and end positions of the hits are correctly
    recomputed when the model only partially matched the sequence.
    Every hit of this table is on strand -.
    """
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    results_dir = "Results_Integron_Finder_" + self.rep_name
    infernal_path = os.path.join("tests", "data", results_dir, "other",
                                 self.rep_name + "_attc_table-partialm.res")
    found = integron_finder.read_infernal(infernal_path)

    # Expected hits: (cm_debut, cm_fin, pos_beg, pos_end, evalue)
    expected_hits = [
        (1, 40, 17818, 17884, 1e-9),
        (1, 47, 19080, 19149, 1e-4),
        (10, 47, 19618, 19735, 1.1e-7),
    ]
    expected = pd.DataFrame(columns=columns)
    for cm_debut, cm_fin, pos_beg, pos_end, evalue in expected_hits:
        expected = expected.append(
            {
                "Accession_number": self.rep_name,
                "cm_attC": "attC_4",
                "cm_debut": cm_debut,
                "cm_fin": cm_fin,
                "pos_beg": pos_beg,
                "pos_end": pos_end,
                "sens": "-",
                "evalue": evalue,
            },
            ignore_index=True)

    # Position columns must be integers
    int_columns = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expected[int_columns] = expected[int_columns].astype(int)
    pdt.assert_frame_equal(found, expected)
def test_generate_df(self):
    """
    Check that read_infernal returns the dataframe matching the content
    of the infernal file when this file exists and contains hits.
    """
    def make_hit(pos_beg, pos_end, evalue):
        # One expected row; only positions and e-value differ between hits
        return {
            "Accession_number": self.rep_name,
            "cm_attC": "attC_4",
            "cm_debut": 1,
            "cm_fin": 47,
            "pos_beg": pos_beg,
            "pos_end": pos_end,
            "sens": "-",
            "evalue": evalue,
        }

    infernal_path = os.path.join("tests", "data",
                                 "Results_Integron_Finder_" + self.rep_name,
                                 "other",
                                 self.rep_name + "_attc_table.res")
    found = integron_finder.read_infernal(infernal_path)

    expected = pd.DataFrame(columns=["Accession_number", "cm_attC",
                                     "cm_debut", "cm_fin", "pos_beg",
                                     "pos_end", "sens", "evalue"])
    expected_hits = [
        make_hit(17825, 17884, 1e-9),
        make_hit(19080, 19149, 1e-4),
        make_hit(19618, 19726, 1.1e-7),
    ]
    for hit in expected_hits:
        expected = expected.append(hit, ignore_index=True)

    # Position columns must be integers
    int_columns = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    expected[int_columns] = expected[int_columns].astype(int)
    pdt.assert_frame_equal(found, expected)
def test_search_attc_empty(self):
    """
    Check that search_attc returns an empty list of arrays when no attC
    site was detected.
    """
    results_dir = "Results_Integron_Finder_" + self.rep_name
    infernal_path = os.path.join("tests", "data", results_dir, "other",
                                 self.rep_name + "_attc_table-empty.res")
    # Build the attC dataframe from the (hit-less) infernal output file
    hits = integron_finder.read_infernal(infernal_path)
    arrays = integron_finder.search_attc(hits, True)
    self.assertEqual(len(arrays), 0)
    self.assertEqual(arrays, [])
def test_nofile(self):
    """
    Check that read_infernal returns an empty dataframe (columns only)
    when the infernal file it is given does not exist.
    """
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    found = integron_finder.read_infernal("infernal.txt")
    expected = pd.DataFrame(columns=columns)
    pdt.assert_frame_equal(found, expected)
def test_evalue_thres(self):
    """
    Check that read_infernal returns an empty dataframe when the infernal
    file exists and contains hits, but the requested e-value threshold
    (1e-10) is smaller than the e-value of every hit: no hit is kept.
    """
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    results_dir = "Results_Integron_Finder_" + self.rep_name
    infernal_path = os.path.join("tests", "data", results_dir, "other",
                                 self.rep_name + "_attc_table.res")
    found = integron_finder.read_infernal(infernal_path, evalue=1e-10)
    expected = pd.DataFrame(columns=columns)
    pdt.assert_frame_equal(found, expected)
def test_nohit(self):
    """
    Check that read_infernal returns an empty dataframe when the infernal
    file exists but does not contain any hit.
    """
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    results_dir = "Results_Integron_Finder_" + self.rep_name
    infernal_path = os.path.join("tests", "data", results_dir, "other",
                                 self.rep_name + "_attc_table-empty.res")
    found = integron_finder.read_infernal(infernal_path)
    expected = pd.DataFrame(columns=columns)
    pdt.assert_frame_equal(found, expected)
def test_find_integron_attC_is_df(self):
    """
    Test find_integron when the attC hits are given as a dataframe (the
    result of read_infernal) instead of a path to the infernal file.

    For replicon acba.007.p01.13 a single integron is expected, holding
    3 attC sites and no integrase, promoter, attI nor protein.
    """
    replicon_name = 'acba.007.p01.13'
    replicon_path = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
    attc_file = os.path.join(
        self._data_dir,
        'Results_Integron_Finder_acba.007.p01.13/other/acba.007.p01.13_attc_table.res')
    intI_file = os.path.join(
        self._data_dir,
        'Results_Integron_Finder_acba.007.p01.13/other/acba.007.p01.13_intI.res')
    phageI_file = os.path.join(
        self._data_dir,
        'Results_Integron_Finder_acba.007.p01.13/other/acba.007.p01.13_phage_int.res')

    # Instantiate the namespace: assigning the bare class and then setting
    # attributes on it would modify argparse.Namespace itself, leaking the
    # options into every other Namespace used in the same process.
    args = argparse.Namespace()
    args.no_proteins = True
    args.keep_palindromes = True

    # Module-level state read by integron_finder
    integron_finder.replicon_name = replicon_name
    integron_finder.SEQUENCE = SeqIO.read(
        replicon_path, "fasta", alphabet=Seq.IUPAC.unambiguous_dna)
    integron_finder.SIZE_REPLICON = len(integron_finder.SEQUENCE)
    integron_finder.args = args
    integron_finder.evalue_attc = 1.
    integron_finder.max_attc_size = 200
    integron_finder.min_attc_size = 40
    integron_finder.length_cm = 47  # length in 'CLEN' (value for model attc_4.cm)
    integron_finder.DISTANCE_THRESHOLD = 4000  # (4kb at least between 2 different arrays)
    integron_finder.model_attc_name = integron_finder.MODEL_attc.split(
        "/")[-1].split(".cm")[0]

    # Parse the infernal file beforehand: find_integron receives the
    # resulting dataframe rather than the file path.
    attc_df = integron_finder.read_infernal(
        attc_file,
        evalue=integron_finder.evalue_attc,
        size_max_attc=integron_finder.max_attc_size,
        size_min_attc=integron_finder.min_attc_size)

    with self.catch_output() as (out, err):
        integrons = integron_finder.find_integron(replicon_name,
                                                  attc_df,
                                                  intI_file,
                                                  phageI_file)

    self.assertEqual(err.getvalue().strip(), "")
    # NOTE(review): the literal below is reproduced exactly as it appears in
    # the reviewed source; confirm its whitespace/line breaks against the
    # summary actually printed by find_integron.
    self.assertEqual(
        out.getvalue().strip(),
        """In replicon acba.007.p01.13, there are: - 0 complete integron(s) found with a total 0 attC site(s) - 1 CALIN element(s) found with a total of 3 attC site(s) - 0 In0 element(s) found with a total of 0 attC site""")

    self.assertEqual(len(integrons), 1)
    integron = integrons[0]
    self.assertEqual(integron.ID_replicon, replicon_name)

    # Expected attC sites of the integron
    exp = pd.DataFrame(
        {
            'annotation': ['attC'] * 3,
            'distance_2attC': [np.nan, 1196.0, 469.0],
            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
            'model': ['attc_4'] * 3,
            'pos_beg': [17825, 19080, 19618],
            'pos_end': [17884, 19149, 19726],
            'strand': [-1, -1, -1],
            'type_elt': 'attC'
        },
        columns=self.columns,
        index=['attc_001', 'attc_002', 'attc_003'])
    pdt.assert_frame_equal(integron.attC, exp)

    # Every other element type must be an empty, correctly typed dataframe
    exp = pd.DataFrame(columns=self.columns)
    exp = exp.astype(dtype=self.dtype)
    pdt.assert_frame_equal(integron.integrase, exp)
    pdt.assert_frame_equal(integron.promoter, exp)
    pdt.assert_frame_equal(integron.attI, exp)
    pdt.assert_frame_equal(integron.proteins, exp)
def test_search_attc_dist_diff_strand(self):
    """
    Check that search_attc finds an array of 3 attC sites, plus two other
    arrays, when the table contains:

    - 3 attC sites on the same strand (-), separated by less than 4kb
    - 2 other attC sites separated by less than 4kb, on the other strand (+)
    - 1 more attC site, also on strand +, but separated by more than 4kb.
    """
    columns = ["Accession_number", "cm_attC", "cm_debut", "cm_fin",
               "pos_beg", "pos_end", "sens", "evalue"]
    int_columns = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]

    def make_hit(pos_beg, pos_end, strand, evalue):
        # One attC row; model name and model coordinates never change here
        return {"Accession_number": self.rep_name,
                "cm_attC": "attC_4",
                "cm_debut": 1,
                "cm_fin": 47,
                "pos_beg": pos_beg,
                "pos_end": pos_end,
                "sens": strand,
                "evalue": evalue}

    def build_expected(hits):
        # Expected array: rows appended in order, positions cast to int
        frame = pd.DataFrame(columns=columns)
        for hit in hits:
            frame = frame.append(hit, ignore_index=True)
        frame[int_columns] = frame[int_columns].astype(int)
        return frame

    infernal_path = os.path.join("tests", "data",
                                 "Results_Integron_Finder_" + self.rep_name,
                                 "other",
                                 self.rep_name + "_attc_table.res")
    # Build the attC dataframe from the infernal output file
    hits_df = integron_finder.read_infernal(infernal_path)

    # Add 3 extra attC sites, all on strand +
    extra_hits = [(15800, 16000), (12000, 12500), (7100, 8200)]
    for pos_beg, pos_end in extra_hits:
        hits_df = hits_df.append(make_hit(pos_beg, pos_end, "+", 1e-3),
                                 ignore_index=True)
    hits_df.sort_values(["Accession_number", "pos_beg", "evalue"], inplace=True)

    # Look for attC arrays, palindromes being kept.
    # Two attC sites belong to the same array when they are on the same
    # strand and less than 4kb apart.
    arrays = integron_finder.search_attc(hits_df, True)
    self.assertEqual(len(arrays), 3)

    # Expected arrays
    minus_array = build_expected([make_hit(17825, 17884, "-", 1e-9),
                                  make_hit(19080, 19149, "-", 1e-4),
                                  make_hit(19618, 19726, "-", 1.1e-7)])
    plus_pair = build_expected([make_hit(12000, 12500, "+", 1e-03),
                                make_hit(15800, 16000, "+", 1e-03)])
    plus_single = build_expected([make_hit(7100, 8200, "+", 1e-03)])

    pdt.assert_frame_equal(minus_array, arrays[2])
    pdt.assert_frame_equal(plus_pair, arrays[1])
    pdt.assert_frame_equal(plus_single, arrays[0])