def test_search_attc_empty(self):
    """
    Check that an infernal table without any attC site yields no attC array.

    The "-empty" fixture table is parsed with ``infernal.read_infernal`` and
    handed to ``attc.search_attc`` (palindromes kept); the result must be an
    empty list.
    """
    table_name = self.replicon_id + "_attc_table-empty.res"
    table_path = self.find_data(os.path.join("fictive_results", table_name))

    # Construct attC dataframe (read from infernal file)
    parsed_hits = infernal.read_infernal(table_path, self.replicon_id, self.length_cm)

    found_arrays = attc.search_attc(parsed_hits,
                                    True,
                                    self.dist_threshold,
                                    self.replicon_size)

    # No cluster at all: zero length, and equal to an empty list.
    self.assertEqual(len(found_arrays), 0)
    expected_arrays = []
    self.assertEqual(found_arrays, expected_arrays)
def test_search_attc_drop_pal_break(self):
    """
    Check palindrome handling when palindromes are not kept.

    The input holds two attC hits at the same position (3000-4000), one on
    each strand. ``search_attc`` is called with its second argument set to
    False (palindromes not kept). The test expects that only the hit with
    the lowest e-value (1e-9, strand "+") survives, and that the three
    remaining sites are then returned as three separate one-row clusters.
    """
    columns = [
        "Accession_number", "cm_attC", "cm_debut", "cm_fin",
        "pos_beg", "pos_end", "sens", "evalue"
    ]
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    # Fields shared by every attC row used in this test.
    base = {
        "Accession_number": self.replicon_id,
        "cm_attC": "attC_4",
        "cm_debut": 1,
        "cm_fin": 47,
    }

    # Build the input table in one go from a list of row dicts:
    # DataFrame.append was removed in pandas 2.0.
    rows = [
        dict(base, pos_beg=1000, pos_end=2000, sens="-", evalue=1e-9),
        # Palindrome pair: same coordinates, opposite strands.
        dict(base, pos_beg=3000, pos_end=4000, sens="-", evalue=1e-4),
        dict(base, pos_beg=3000, pos_end=4000, sens="+", evalue=1e-9),
        dict(base, pos_beg=5500, pos_end=7000, sens="-", evalue=1.1e-7),
    ]
    attc_df = pd.DataFrame(rows, columns=columns)
    attc_df[intcols] = attc_df[intcols].astype(int)

    attc_array = attc.search_attc(attc_df, False, self.dist_threshold, self.replicon_size)
    self.assertEqual(len(attc_array), 3)

    # Construct expected outputs: one single-row frame per cluster.
    attc_res = pd.DataFrame(
        data=dict(base, pos_beg=1000, pos_end=2000, sens="-", evalue=1e-9),
        index=[0])
    attc_res = attc_res[columns]
    attc_res2 = pd.DataFrame(
        data=dict(base, pos_beg=3000, pos_end=4000, sens="+", evalue=1e-9),
        index=[0])
    attc_res2 = attc_res2[columns]
    attc_res3 = pd.DataFrame(
        data=dict(base, pos_beg=5500, pos_end=7000, sens="-", evalue=1.1e-7),
        index=[0])
    attc_res3 = attc_res3[columns]

    # convert positions to int
    attc_res[intcols] = attc_res[intcols].astype(int)
    attc_res2[intcols] = attc_res2[intcols].astype(int)
    attc_res3[intcols] = attc_res3[intcols].astype(int)

    # The "+" strand cluster is expected first, then the two "-" clusters.
    pdt.assert_frame_equal(attc_res2, attc_array[0])
    pdt.assert_frame_equal(attc_res, attc_array[1])
    pdt.assert_frame_equal(attc_res3, attc_array[2])
def test_search_attc_uniq(self):
    """
    Test that it finds a unique attC array when given a table with 3 attC
    sites on the same strand, each separated from the next by less than 4kb.

    The table is read from the replicon's infernal result file, and the
    single returned cluster is compared with a hand-built expected frame.
    """
    attc_file = self.find_data(
        os.path.join(
            "Results_Integron_Finder_{}".format(self.replicon_name),
            "tmp_{}".format(self.replicon_id),
            "{}_attc_table.res".format(self.replicon_id)))
    # Construct attC dataframe (read from infernal file)
    attc_df = infernal.read_infernal(attc_file, self.replicon_id, self.length_cm)

    # Search attC arrays, keeping palindromes.
    # 2 attC sites are in the same array if they are on the same strand, and
    # separated by a distance less than 4kb
    # (presumably the value of self.dist_threshold -- confirm in the fixture).
    attc_array = attc.search_attc(attc_df, True, self.dist_threshold, self.replicon_size)
    self.assertEqual(len(attc_array), 1)

    # Construct expected output. The frame is built in one go from a list of
    # row dicts: DataFrame.append was removed in pandas 2.0.
    columns = [
        "Accession_number", "cm_attC", "cm_debut", "cm_fin",
        "pos_beg", "pos_end", "sens", "evalue"
    ]
    base = {
        "Accession_number": self.replicon_id,
        "cm_attC": "attC_4",
        "cm_debut": 1,
        "cm_fin": 47,
    }
    rows = [
        dict(base, pos_beg=17825, pos_end=17884, sens="-", evalue=1e-9),
        dict(base, pos_beg=19080, pos_end=19149, sens="-", evalue=1e-4),
        dict(base, pos_beg=19618, pos_end=19726, sens="-", evalue=1.1e-7),
    ]
    attc_res = pd.DataFrame(rows, columns=columns)

    # convert positions to int
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    attc_res[intcols] = attc_res[intcols].astype(int)

    pdt.assert_frame_equal(attc_res, attc_array[0])
def test_search_attc_uniq_circ_plus(self):
    """
    Test that it finds a unique attC array when given a table with:

    - 2 attC sites at the beginning of the genome, separated by less than 4kb
    - 2 attC sites at the end of the genome, separated by less than 4kb from
      the 1st attC site of the genome if we take into account its circularity.

    All 4 attC sites are on the same strand (+).
    """
    columns = [
        "Accession_number", "cm_attC", "cm_debut", "cm_fin",
        "pos_beg", "pos_end", "sens", "evalue"
    ]
    base = {
        "Accession_number": self.replicon_id,
        "cm_attC": "attC_4",
        "cm_debut": 1,
        "cm_fin": 47,
    }
    # Build the input table in one go from a list of row dicts:
    # DataFrame.append was removed in pandas 2.0.
    rows = [
        dict(base, pos_beg=1000, pos_end=2000, sens="+", evalue=1e-9),
        dict(base, pos_beg=3000, pos_end=4000, sens="+", evalue=1e-4),
        dict(base, pos_beg=16000, pos_end=17000, sens="+", evalue=1.1e-7),
        dict(base, pos_beg=19815, pos_end=20000, sens="+", evalue=1.1e-7),
    ]
    attc_df = pd.DataFrame(rows, columns=columns)

    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    attc_df[intcols] = attc_df[intcols].astype(int)

    # Search attC arrays, keeping palindromes.
    attc_array = attc.search_attc(attc_df, True, self.dist_threshold, self.replicon_size)
    self.assertEqual(len(attc_array), 1)

    # Output of search_attc is ordered as the cluster is:
    # - 2 last attC of genome
    # - 2 first attC of genome
    # Whereas input was ordered by begin position.
    # Reorder attc_df to compare with output.
    attc_df = attc_df.reindex([2, 3, 0, 1])
    attc_df.reset_index(inplace=True, drop=True)
    pdt.assert_frame_equal(attc_df, attc_array[0])
def test_search_attc_dist_diff_strand(self):
    """
    Test that it finds 3 attC arrays when given a table with:

    - 3 attC sites on the same strand (-) and separated by less than 4kb
      (read from the replicon's infernal result file)
    - 2 other attC sites separated by less than 4kb but on the other
      strand (+)
    - 1 other attC site, also on strand +, but separated by more than 4kb.
    """
    attc_file = self.find_data(
        os.path.join(
            "Results_Integron_Finder_{}".format(self.replicon_name),
            "tmp_{}".format(self.replicon_id),
            "{}_attc_table.res".format(self.replicon_id)))
    # Construct attC dataframe (read from infernal file)
    attc_df = infernal.read_infernal(attc_file, self.replicon_id, self.length_cm)

    base = {
        "Accession_number": self.replicon_id,
        "cm_attC": "attC_4",
        "cm_debut": 1,
        "cm_fin": 47,
    }
    # Add 3 attC sites on strand +. pd.concat replaces DataFrame.append,
    # which was removed in pandas 2.0.
    extra_rows = [
        dict(base, pos_beg=15800, pos_end=16000, sens="+", evalue=1e-3),
        dict(base, pos_beg=12000, pos_end=12500, sens="+", evalue=1e-3),
        dict(base, pos_beg=7100, pos_end=8200, sens="+", evalue=1e-3),
    ]
    attc_df = pd.concat([attc_df, pd.DataFrame(extra_rows)], ignore_index=True)
    attc_df.sort_values(["Accession_number", "pos_beg", "evalue"], inplace=True)

    # Search attC arrays, keeping palindromes.
    # 2 attC sites are in the same array if they are on the same strand, and
    # separated by a distance less than 4kb
    # (presumably the value of self.dist_threshold -- confirm in the fixture).
    attc_array = attc.search_attc(attc_df, True, self.dist_threshold, self.replicon_size)
    self.assertEqual(len(attc_array), 3)

    # Construct expected outputs, each one built from a list of row dicts.
    columns = [
        "Accession_number", "cm_attC", "cm_debut", "cm_fin",
        "pos_beg", "pos_end", "sens", "evalue"
    ]
    # The 3 sites on strand -.
    attc_res = pd.DataFrame(
        [
            dict(base, pos_beg=17825, pos_end=17884, sens="-", evalue=1e-9),
            dict(base, pos_beg=19080, pos_end=19149, sens="-", evalue=1e-4),
            dict(base, pos_beg=19618, pos_end=19726, sens="-", evalue=1.1e-7),
        ],
        columns=columns)
    # The 2 sites on strand + expected in the same array.
    attc_res2 = pd.DataFrame(
        [
            dict(base, pos_beg=12000, pos_end=12500, sens="+", evalue=1e-03),
            dict(base, pos_beg=15800, pos_end=16000, sens="+", evalue=1e-03),
        ],
        columns=columns)
    # The site on strand + expected alone in its array.
    attc_res3 = pd.DataFrame(
        [
            dict(base, pos_beg=7100, pos_end=8200, sens="+", evalue=1e-03),
        ],
        columns=columns)

    # convert positions to int
    intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
    attc_res[intcols] = attc_res[intcols].astype(int)
    attc_res2[intcols] = attc_res2[intcols].astype(int)
    attc_res3[intcols] = attc_res3[intcols].astype(int)

    # Expected order of the returned arrays: lone "+" site, then the "+"
    # pair, then the three "-" sites.
    pdt.assert_frame_equal(attc_res, attc_array[2])
    pdt.assert_frame_equal(attc_res2, attc_array[1])
    pdt.assert_frame_equal(attc_res3, attc_array[0])