Ejemplo n.º 1
0
 def test_filter_evalue_thres(self):
     """
     Test that the filter by a maximum e-value works.

     The attC table is read with ``evalue=1e-8``; the expected result holds
     a single hit, whose e-value is 1e-9.
     """
     filename = self.find_data(
         os.path.join(
             "Results_Integron_Finder_{}".format(self.replicon_name),
             "tmp_{}".format(self.replicon_id),
             "{}_attc_table.res".format(self.replicon_id)))
     df = infernal.read_infernal(filename,
                                 self.replicon_id,
                                 self.length_cm,
                                 evalue=1e-8)
     columns = [
         "Accession_number", "cm_attC", "cm_debut", "cm_fin", "pos_beg",
         "pos_end", "sens", "evalue"
     ]
     # DataFrame.append was removed in pandas 2.0: build the expected frame
     # in one go from its row records instead of appending to an empty frame.
     expect = pd.DataFrame(
         [{
             "Accession_number": self.replicon_id,
             "cm_attC": "attC_4",
             "cm_debut": 1,
             "cm_fin": 47,
             "pos_beg": 17825,
             "pos_end": 17884,
             "sens": "-",
             "evalue": 1e-9
         }],
         columns=columns)
     expect = expect.astype(self.dtype)
     pdt.assert_frame_equal(df, expect)
Ejemplo n.º 2
0
 def test_no_total_cm_match_strandm(self):
     """
     Test that when the model did not completely match on the sequence,
     the start and end positions of the hits are correctly recalculated.
     All hits are on strand -.
     """
     filename = self.find_data(
         os.path.join("fictive_results", "{}_attc_table-partialm.res".format(self.replicon_id)))
     df = infernal.read_infernal(filename, self.replicon_id, self.length_cm)
     columns = ["Accession_number", "cm_attC", "cm_debut",
                "cm_fin", "pos_beg", "pos_end", "sens", "evalue"]
     # Fields shared by every expected hit.
     common = {"Accession_number": self.replicon_id, "cm_attC": "attC_4", "sens": "-"}
     # DataFrame.append was removed in pandas 2.0: build the expected frame
     # in one go from its row records instead of appending row by row.
     expect = pd.DataFrame(
         [dict(common, cm_debut=1, cm_fin=40, pos_beg=17818, pos_end=17884, evalue=1e-9),
          dict(common, cm_debut=1, cm_fin=47, pos_beg=19080, pos_end=19149, evalue=1e-4),
          dict(common, cm_debut=10, cm_fin=47, pos_beg=19618, pos_end=19735, evalue=1.1e-7)],
         columns=columns)
     # convert positions to int
     intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
     expect[intcols] = expect[intcols].astype(int)
     pdt.assert_frame_equal(df, expect)
Ejemplo n.º 3
0
 def test_generate_df(self):
     """
     Test that if the infernal file exists and there are hits, it returns the
     dataframe corresponding to it.
     """
     filename = self.find_data(os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                                            "tmp_{}".format(self.replicon_id),
                                            "{}_attc_table.res".format(self.replicon_id)))
     df = infernal.read_infernal(filename, self.replicon_id, self.length_cm)
     columns = ["Accession_number", "cm_attC", "cm_debut",
                "cm_fin", "pos_beg", "pos_end", "sens", "evalue"]
     # Fields shared by every expected hit.
     common = {"Accession_number": self.replicon_id, "cm_attC": "attC_4",
               "cm_debut": 1, "cm_fin": 47, "sens": "-"}
     # DataFrame.append was removed in pandas 2.0: build the expected frame
     # in one go from its row records instead of appending row by row.
     expect = pd.DataFrame(
         [dict(common, pos_beg=17825, pos_end=17884, evalue=1e-9),
          dict(common, pos_beg=19080, pos_end=19149, evalue=1e-4),
          dict(common, pos_beg=19618, pos_end=19726, evalue=1.1e-7)],
         columns=columns)
     # convert positions to int
     intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
     expect[intcols] = expect[intcols].astype(int)
     pdt.assert_frame_equal(df, expect)
Ejemplo n.º 4
0
 def test_nohit(self):
     """
     An infernal file that exists but holds no hit must produce an
     empty dataframe.
     """
     res_name = "{}_attc_table-empty.res".format(self.replicon_id)
     filename = self.find_data(os.path.join("fictive_results", res_name))
     observed = infernal.read_infernal(filename, self.replicon_id, self.length_cm)
     # Expected: the usual columns and no row at all.
     header = ["Accession_number", "cm_attC", "cm_debut",
               "cm_fin", "pos_beg", "pos_end", "sens", "evalue"]
     pdt.assert_frame_equal(observed, pd.DataFrame(columns=header))
Ejemplo n.º 5
0
    def test_nofile(self):
        """
        When the given infernal file does not exist, the function must
        return an empty dataframe.
        """
        missing_file = "infernal.txt"
        observed = infernal.read_infernal(missing_file, self.replicon_id, self.length_cm)
        # Expected: the usual columns and no row at all.
        header = ["Accession_number", "cm_attC", "cm_debut",
                  "cm_fin", "pos_beg", "pos_end", "sens", "evalue"]
        pdt.assert_frame_equal(observed, pd.DataFrame(columns=header))
Ejemplo n.º 6
0
 def test_evalue_thres(self):
     """
     The infernal file exists and contains hits, but the e-value threshold
     given (1e-10) is smaller than the e-value of every hit: nothing is
     kept and an empty dataframe is expected.
     """
     res_dir = "Results_Integron_Finder_{}".format(self.replicon_name)
     tmp_dir = "tmp_{}".format(self.replicon_id)
     res_name = "{}_attc_table.res".format(self.replicon_id)
     filename = self.find_data(os.path.join(res_dir, tmp_dir, res_name))
     observed = infernal.read_infernal(filename, self.replicon_id, self.length_cm, evalue=1e-10)
     # Expected: the usual columns and no row at all.
     header = ["Accession_number", "cm_attC", "cm_debut",
               "cm_fin", "pos_beg", "pos_end", "sens", "evalue"]
     pdt.assert_frame_equal(observed, pd.DataFrame(columns=header))
 def test_search_attc_empty(self):
     """
     With no attC site detected, search_attc must return an empty array.
     """
     res_name = self.replicon_id + "_attc_table-empty.res"
     attc_file = self.find_data(os.path.join("fictive_results", res_name))
     # attC dataframe as read from the (empty) infernal file
     hits = infernal.read_infernal(attc_file, self.replicon_id, self.length_cm)
     found = attc.search_attc(hits, True, self.dist_threshold, self.replicon_size)
     # Check both the size and the exact value of the result.
     self.assertEqual(len(found), 0)
     self.assertEqual(found, [])
 def test_filter_evalue_thres(self):
     """
     Test that the filter by a maximum e-value works.

     The attC table is read with ``evalue=1e-8``; the expected result holds
     a single hit, whose e-value is 1e-9.
     """
     filename = self.find_data(os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                                            "tmp_{}".format(self.replicon_id),
                                            "{}_attc_table.res".format(self.replicon_id)))
     df = infernal.read_infernal(filename, self.replicon_id, self.length_cm, evalue=1e-8)
     columns = ["Accession_number", "cm_attC", "cm_debut",
                "cm_fin", "pos_beg", "pos_end", "sens", "evalue"]
     # DataFrame.append was removed in pandas 2.0: build the expected frame
     # in one go from its row records instead of appending to an empty frame.
     expect = pd.DataFrame([{"Accession_number": self.replicon_id, "cm_attC": "attC_4",
                             "cm_debut": 1, "cm_fin": 47, "pos_beg": 17825,
                             "pos_end": 17884, "sens": "-", "evalue": 1e-9}],
                           columns=columns)
     # convert positions to int
     intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
     expect[intcols] = expect[intcols].astype(int)
     pdt.assert_frame_equal(df, expect)
Ejemplo n.º 9
0
 def test_attcsize_minthres(self):
     """
     Test that the filter by a minimum attc size works.

     The attC table is read with ``size_min_attc=60``; the expected result
     holds the two hits listed below.
     """
     filename = self.find_data(os.path.join("Results_Integron_Finder_{}".format(self.replicon_name),
                                            "tmp_{}".format(self.replicon_id),
                                            "{}_attc_table.res".format(self.replicon_id)))
     df = infernal.read_infernal(filename, self.replicon_id, self.length_cm, size_min_attc=60)
     columns = ["Accession_number", "cm_attC", "cm_debut",
                "cm_fin", "pos_beg", "pos_end", "sens", "evalue"]
     # Fields shared by every expected hit.
     common = {"Accession_number": self.replicon_id, "cm_attC": "attC_4",
               "cm_debut": 1, "cm_fin": 47, "sens": "-"}
     # DataFrame.append was removed in pandas 2.0: build the expected frame
     # in one go from its row records instead of appending row by row.
     expect = pd.DataFrame(
         [dict(common, pos_beg=19080, pos_end=19149, evalue=1e-4),
          dict(common, pos_beg=19618, pos_end=19726, evalue=1.1e-7)],
         columns=columns)
     # convert positions to int
     intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
     expect[intcols] = expect[intcols].astype(int)
     pdt.assert_frame_equal(df, expect)
    def test_search_attc_uniq(self):
        """
        Test that it finds a unique attc array when giving a table with 3 attC sites
        on the same strand and separated by less than 4kb each.
        """
        attc_file = self.find_data(
            os.path.join(
                "Results_Integron_Finder_{}".format(self.replicon_name),
                "tmp_{}".format(self.replicon_id),
                "{}_attc_table.res".format(self.replicon_id)))
        # Construct attC dataframe (read from infernal file)
        attc_df = infernal.read_infernal(attc_file, self.replicon_id,
                                         self.length_cm)
        # search attC arrays, keeping palindromes
        # 2 attc sites are in the same array if they are on the same strand, and separated by
        # a distance less than 4kb
        attc_array = attc.search_attc(attc_df, True, self.dist_threshold,
                                      self.replicon_size)
        self.assertEqual(len(attc_array), 1)

        # Construct expected output.
        columns = [
            "Accession_number", "cm_attC", "cm_debut", "cm_fin", "pos_beg",
            "pos_end", "sens", "evalue"
        ]
        # Fields shared by every expected site.
        common = {
            "Accession_number": self.replicon_id,
            "cm_attC": "attC_4",
            "cm_debut": 1,
            "cm_fin": 47,
            "sens": "-",
        }
        # DataFrame.append was removed in pandas 2.0: build the expected frame
        # in one go from its row records instead of appending row by row.
        attc_res = pd.DataFrame(
            [
                dict(common, pos_beg=17825, pos_end=17884, evalue=1e-9),
                dict(common, pos_beg=19080, pos_end=19149, evalue=1e-4),
                dict(common, pos_beg=19618, pos_end=19726, evalue=1.1e-7),
            ],
            columns=columns)
        # convert positions to int
        intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
        attc_res[intcols] = attc_res[intcols].astype(int)
        pdt.assert_frame_equal(attc_res, attc_array[0])
    def test_search_attc_dist_diff_strand(self):
        """
        Test that it finds a size 3 attc array when giving a table with:
        - 3 attC sites on the same strand (-) and separated by less than 4 kb
        - 2 other attC sites separated by less than 4kb but on the other strand (+)
        - 1 other attC site, also on strand +, but separated by more than 4kb.
        """
        columns = [
            "Accession_number", "cm_attC", "cm_debut", "cm_fin", "pos_beg",
            "pos_end", "sens", "evalue"
        ]

        def site(pos_beg, pos_end, sens, evalue):
            """Return the record of one attC site; only position, strand and e-value vary."""
            return {
                "Accession_number": self.replicon_id,
                "cm_attC": "attC_4",
                "cm_debut": 1,
                "cm_fin": 47,
                "pos_beg": pos_beg,
                "pos_end": pos_end,
                "sens": sens,
                "evalue": evalue
            }

        attc_file = self.find_data(
            os.path.join(
                "Results_Integron_Finder_{}".format(self.replicon_name),
                "tmp_{}".format(self.replicon_id),
                "{}_attc_table.res".format(self.replicon_id)))
        # Construct attC dataframe (read from infernal file)
        attc_df = infernal.read_infernal(attc_file, self.replicon_id,
                                         self.length_cm)
        # Add 3 more attC sites, all on strand +.
        # DataFrame.append was removed in pandas 2.0: use pd.concat instead.
        extra_sites = pd.DataFrame(
            [
                site(15800, 16000, "+", 1e-3),
                site(12000, 12500, "+", 1e-3),
                site(7100, 8200, "+", 1e-3),
            ],
            columns=columns)
        attc_df = pd.concat([attc_df, extra_sites], ignore_index=True)
        attc_df.sort_values(["Accession_number", "pos_beg", "evalue"],
                            inplace=True)
        # search attC arrays, keeping palindromes
        # 2 attc sites are in the same array if they are on the same strand, and separated by
        # a distance less than 4kb
        attc_array = attc.search_attc(attc_df, True, self.dist_threshold,
                                      self.replicon_size)
        self.assertEqual(len(attc_array), 3)

        # Construct expected outputs: one dataframe per expected array.
        attc_res = pd.DataFrame(
            [
                site(17825, 17884, "-", 1e-9),
                site(19080, 19149, "-", 1e-4),
                site(19618, 19726, "-", 1.1e-7),
            ],
            columns=columns)
        attc_res2 = pd.DataFrame(
            [
                site(12000, 12500, "+", 1e-03),
                site(15800, 16000, "+", 1e-03),
            ],
            columns=columns)
        attc_res3 = pd.DataFrame(
            [site(7100, 8200, "+", 1e-03)],
            columns=columns)
        # convert positions to int
        intcols = ["cm_debut", "cm_fin", "pos_beg", "pos_end"]
        attc_res[intcols] = attc_res[intcols].astype(int)
        attc_res2[intcols] = attc_res2[intcols].astype(int)
        attc_res3[intcols] = attc_res3[intcols].astype(int)
        pdt.assert_frame_equal(attc_res, attc_array[2])
        pdt.assert_frame_equal(attc_res2, attc_array[1])
        pdt.assert_frame_equal(attc_res3, attc_array[0])
    def test_find_integron_attC_is_df(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(
            os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        attc_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_attc_table.res'.format(replicon.id)))

        intI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(
            os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                         'tmp_{}'.format(replicon.id),
                         '{}_phage_int.res'.format(replicon.id)))

        args = argparse.Namespace()
        args.no_proteins = True
        args.keep_palindromes = True
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000
        args.calin_threshold = 2
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)

        attc_file = read_infernal(attc_file,
                                  replicon_name,
                                  len_model_attc,
                                  evalue=cfg.evalue_attc,
                                  size_max_attc=cfg.max_attc_size,
                                  size_min_attc=cfg.min_attc_size)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon, prot_db, attc_file, intI_file,
                                      phageI_file, cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)

        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame(
            {
                'annotation': ['attC'] * 3,
                'distance_2attC': [np.nan, 1196.0, 469.0],
                'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                'model': ['attc_4'] * 3,
                'pos_beg': [17825, 19080, 19618],
                'pos_end': [17884, 19149, 19726],
                'strand': [-1, -1, -1],
                'type_elt': 'attC'
            },
            columns=self.columns,
            index=['attc_001', 'attc_002', 'attc_003'])
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.integrase, exp)
        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)
    def test_find_integron_attC_is_df(self):
        replicon_name = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        prot_file = self.find_data(os.path.join('Proteins', replicon_id + '.prt'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)
        attc_file = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                                'tmp_{}'.format(replicon.id),
                                                '{}_attc_table.res'.format(replicon.id)))

        intI_file = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                                'tmp_{}'.format(replicon.id),
                                                '{}_intI.res'.format(replicon.id)))
        phageI_file = self.find_data(os.path.join('Results_Integron_Finder_{}'.format(replicon_name),
                                                  'tmp_{}'.format(replicon.id),
                                                  '{}_phage_int.res'.format(replicon.id)))

        args = argparse.Namespace()
        args.no_proteins = True
        args.keep_palindromes = True
        args.attc_model = 'attc_4.cm'
        args.evalue_attc = 1.0
        args.max_attc_size = 200
        args.min_attc_size = 40
        args.distance_threshold = 4000
        args.calin_threshold = 2
        args.local_max = False
        cfg = Config(args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        len_model_attc = 47  # length in 'CLEN' (value for model attc_4.cm)

        attc_file = read_infernal(attc_file, replicon_name,
                                  len_model_attc,
                                  evalue=cfg.evalue_attc,
                                  size_max_attc=cfg.max_attc_size,
                                  size_min_attc=cfg.min_attc_size)
        prot_db = ProdigalDB(replicon, cfg, prot_file=prot_file)

        exp_msg = """In replicon {}, there are:
- 0 complete integron(s) found with a total 0 attC site(s)
- 1 CALIN element(s) found with a total of 3 attC site(s)
- 0 In0 element(s) found with a total of 0 attC site""".format(replicon.id)
        with self.catch_log() as log:
            integrons = find_integron(replicon,
                                      prot_db,
                                      attc_file,
                                      intI_file,
                                      phageI_file,
                                      cfg)
            catch_msg = log.get_value().strip()
        self.assertEqual(catch_msg, exp_msg)

        self.assertEqual(len(integrons), 1)
        integron = integrons[0]
        self.assertEqual(integron.replicon.name, replicon_id)

        exp = pd.DataFrame({'annotation': ['attC'] * 3,
                            'distance_2attC': [np.nan, 1196.0, 469.0],
                            'evalue': [1.000000e-09, 1.000000e-04, 1.100000e-07],
                            'model': ['attc_4'] * 3,
                            'pos_beg': [17825, 19080, 19618],
                            'pos_end': [17884, 19149, 19726],
                            'strand': [-1, -1, -1],
                            'type_elt': 'attC'},
        columns=self.columns,
        index=['attc_001', 'attc_002', 'attc_003'])
        pdt.assert_frame_equal(integron.attC, exp)

        exp = pd.DataFrame(columns=self.columns)
        exp = exp.astype(dtype=self.dtype)

        pdt.assert_frame_equal(integron.integrase, exp)
        pdt.assert_frame_equal(integron.promoter, exp)
        pdt.assert_frame_equal(integron.attI, exp)
        pdt.assert_frame_equal(integron.proteins, exp)