def test_integron_1elem_int(self):
    """
    add_feature on a report holding a single integron made of one integrase.

    Two features are expected on the record, in this order: the integron
    itself, then the integrase carrying its protein translation.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "ACBA.007.P01_13_1",
        "pos_beg": 55,
        "pos_end": 1014,
        "strand": 1,
        "evalue": 1.9e-25,
        "type_elt": "protein",
        "annotation": "intI",
        "model": "intersection_tyr_intI",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    # Expected translation: protein ACBA.007.P01_13_1 in
    # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
    expected_translation = (
        "MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
        "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
        "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
        "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
        "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
        "HNKMLRPGLCVVHASPQYL*"
    )
    # Feature locations are compared against pos_beg - 1 and pos_end
    expected_start = row["pos_beg"] - 1
    expected_end = row["pos_end"]

    report = pd.DataFrame(row, index=[0])
    seq_before = self.seq.seq
    id_before = self.seq.id

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    features = self.seq.features
    # Exactly 2 features are expected (integron and protein)
    self.assertEqual(len(features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # First feature: the integron
    integron = features[0]
    self.assertEqual(integron.location.start, expected_start)
    self.assertEqual(integron.location.end, expected_end)
    self.assertEqual(integron.strand, 0)
    self.assertEqual(integron.type, "integron")
    self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
    self.assertEqual(integron.qualifiers["integron_type"], row["type"])

    # Second feature: the integrase
    integrase = features[1]
    self.assertEqual(integrase.location.start, expected_start)
    self.assertEqual(integrase.location.end, expected_end)
    self.assertEqual(integrase.strand, row["strand"])
    self.assertEqual(integrase.type, "integrase")
    self.assertEqual(integrase.qualifiers["protein_id"], row["element"])
    self.assertEqual(integrase.qualifiers["gene"], row["annotation"])
    self.assertEqual(integrase.qualifiers["model"], row["model"])
    self.assertEqual(str(integrase.qualifiers["translation"]), expected_translation)
def test_integron_1elem_prom(self):
    """
    add_feature on a report holding a single integron made of one promoter.

    Two features are expected on the record, in this order: the integron
    itself, then the promoter.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "Pc_int1",
        "pos_beg": 25,
        "pos_end": 51,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "Promoter",
        "annotation": "Pc_1",
        "model": "NA",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    # Feature locations are compared against pos_beg - 1 and pos_end
    expected_start = row["pos_beg"] - 1
    expected_end = row["pos_end"]

    report = pd.DataFrame(row, index=[0])
    seq_before = self.seq.seq
    id_before = self.seq.id

    # NOTE(review): sibling tests pass self.prot_db here, this one passes
    # self.prot_file - confirm against the fixture which one is intended.
    add_feature(self.seq, report, self.prot_file, self.dist_threshold)

    features = self.seq.features
    # Exactly 2 features are expected (integron and promoter)
    self.assertEqual(len(features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # First feature: the integron
    integron = features[0]
    self.assertEqual(integron.location.start, expected_start)
    self.assertEqual(integron.location.end, expected_end)
    self.assertEqual(integron.strand, 0)
    self.assertEqual(integron.type, "integron")
    self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
    self.assertEqual(integron.qualifiers["integron_type"], row["type"])

    # Second feature: the promoter
    promoter = features[1]
    self.assertEqual(promoter.location.start, expected_start)
    self.assertEqual(promoter.location.end, expected_end)
    self.assertEqual(promoter.strand, row["strand"])
    self.assertEqual(promoter.type, row["type_elt"])
    self.assertEqual(promoter.qualifiers["Promoter"], row["element"])
    self.assertEqual(promoter.qualifiers["model"], row["model"])
def test_integron_1elem_prot(self):
    """
    add_feature on a report holding a single integron made of one protein.

    Two features are expected on the record, in this order: the integron
    itself, then the protein (a CDS) carrying its translation.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "ACBA.007.P01_13_20",
        "pos_beg": 17375,
        "pos_end": 17375,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "protein",
        "annotation": "protein",
        "model": "NA",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    # Expected translation: protein ACBA.007.P01_13_20 in
    # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
    expected_translation = (
        "MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
        "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*"
    )
    # Feature locations are compared against pos_beg - 1 and pos_end
    expected_start = row["pos_beg"] - 1
    expected_end = row["pos_end"]

    report = pd.DataFrame(row, index=[0])
    seq_before = self.seq.seq
    id_before = self.seq.id

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    features = self.seq.features
    # Exactly 2 features are expected (integron and protein)
    self.assertEqual(len(features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # First feature: the integron
    integron = features[0]
    self.assertEqual(integron.location.start, expected_start)
    self.assertEqual(integron.location.end, expected_end)
    self.assertEqual(integron.strand, 0)
    self.assertEqual(integron.type, "integron")
    self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
    self.assertEqual(integron.qualifiers["integron_type"], row["type"])

    # Second feature: the protein
    protein = features[1]
    self.assertEqual(protein.location.start, expected_start)
    self.assertEqual(protein.location.end, expected_end)
    self.assertEqual(protein.strand, row["strand"])
    self.assertEqual(protein.type, "CDS")
    self.assertEqual(protein.qualifiers["protein_id"], row["element"])
    self.assertEqual(protein.qualifiers["gene"], row["annotation"])
    self.assertEqual(protein.qualifiers["model"], row["model"])
    self.assertEqual(str(protein.qualifiers["translation"]), expected_translation)
def test_integron_1elem_prom(self):
    """
    add_feature on a report holding a single integron made of one promoter.

    Two features are expected on the record, in this order: the integron
    itself, then the promoter.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "Pc_int1",
        "pos_beg": 25,
        "pos_end": 51,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "Promoter",
        "annotation": "Pc_1",
        "model": "NA",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    # Feature locations are compared against pos_beg - 1 and pos_end
    expected_start = row["pos_beg"] - 1
    expected_end = row["pos_end"]

    report = pd.DataFrame(row, index=[0])
    seq_before = self.seq.seq
    id_before = self.seq.id

    # NOTE(review): sibling tests pass self.prot_db here, this one passes
    # self.prot_file - confirm against the fixture which one is intended.
    add_feature(self.seq, report, self.prot_file, self.dist_threshold)

    features = self.seq.features
    # Exactly 2 features are expected (integron and promoter)
    self.assertEqual(len(features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # First feature: the integron
    integron = features[0]
    self.assertEqual(integron.location.start, expected_start)
    self.assertEqual(integron.location.end, expected_end)
    self.assertEqual(integron.strand, 0)
    self.assertEqual(integron.type, "integron")
    self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
    self.assertEqual(integron.qualifiers["integron_type"], row["type"])

    # Second feature: the promoter
    promoter = features[1]
    self.assertEqual(promoter.location.start, expected_start)
    self.assertEqual(promoter.location.end, expected_end)
    self.assertEqual(promoter.strand, row["strand"])
    self.assertEqual(promoter.type, row["type_elt"])
    self.assertEqual(promoter.qualifiers["Promoter"], row["element"])
    self.assertEqual(promoter.qualifiers["model"], row["model"])
def test_integron_long_seqname(self):
    """
    add_feature on a record whose name was lengthened before the call.

    The report holds a single integron made of one protein. The record name
    is prefixed with 8 extra characters, and the test checks the name found
    on the record once add_feature has run.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "ACBA.007.P01_13_20",
        "pos_beg": 17375,
        "pos_end": 17375,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "protein",
        "annotation": "protein",
        "model": "NA",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    report = pd.DataFrame(row, index=[0])

    seq_before = self.seq.seq
    id_before = self.seq.id
    original_name = self.seq.name
    # Lengthen the record name before annotating the record
    self.seq.name = "abcdefgh" + original_name

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    # Exactly 2 features are expected (integron and protein)
    self.assertEqual(len(self.seq.features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)
    # The name has been shortened: only the last character of the added
    # prefix is expected in front of the original name
    self.assertEqual(self.seq.name, "h" + original_name)
def test_integron_long_seqname(self):
    """
    add_feature on a record whose name was lengthened before the call.

    The report holds a single integron made of one protein. The record name
    is prefixed with 8 extra characters, and the test checks the name found
    on the record once add_feature has run.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "ACBA.007.P01_13_20",
        "pos_beg": 17375,
        "pos_end": 17375,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "protein",
        "annotation": "protein",
        "model": "NA",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    report = pd.DataFrame(row, index=[0])

    seq_before = self.seq.seq
    id_before = self.seq.id
    original_name = self.seq.name
    # Lengthen the record name before annotating the record
    self.seq.name = "abcdefgh" + original_name

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    # Exactly 2 features are expected (integron and protein)
    self.assertEqual(len(self.seq.features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)
    # The name has been shortened: only the last character of the added
    # prefix is expected in front of the original name
    self.assertEqual(self.seq.name, "h" + original_name)
def find_integron_in_one_replicon(replicon, config):
    """
    Search one replicon for integrons and write the result files.

    Depending on the configuration, the following steps are run:

    * search of integrases (skipped when ``config.no_proteins`` is set)
    * search of attC sites
    * second search based on the first results (``config.local_max``)
    * addition of proteins, promoters and attI sites to the integrons
    * functional annotation, when hmm profiles are available
    * writing of the ``.integrons`` and ``.summary`` files, of a genbank file
      (``config.gbk``) and of one pdf per complete integron (``config.pdf``)

    The temporary directory of the replicon is removed at the end unless
    ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons) and the
              summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because of an EmptyFileError,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: when functional annotation is requested and neither
                           'bank_hmm' nor ``config.func_annot_path`` exists.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)

    def _tmp_path(suffix):
        # path of a temporary file named after the replicon id
        return os.path.join(result_tmp_dir, replicon.id + suffix)

    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # any OSError is deliberately ignored here
        # (presumably the directory already exists)
        pass

    tmp_replicon_path = _tmp_path('.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provided on the command line
    is_func_annot = True
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            hmm_dir = 'bank_hmm'
        elif os.path.exists(config.func_annot_path):
            hmm_dir = config.func_annot_path
        else:
            raise IntegronError("the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                                "profile with --path-func-annot option".format(config.func_annot_path))
        fa_hmm = scan_hmm_bank(hmm_dir)
    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
    else:
        # fa_hmm stays undefined; it is only read when is_func_annot is True
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning("No hmm profiles for functional annotation detected, skip functional annotation step.")

    if config.gembase_path:
        protein_db = GembaseDB(replicon, config, gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = _tmp_path("_intI.res")
    phageI_file = _tmp_path("_phage_int.res")
    attC_default_file = _tmp_path("_attc_table.res")

    try:
        # the integrase search is run only when one of its result files is missing
        if not config.no_proteins and not (os.path.isfile(intI_file) and os.path.isfile(phageI_file)):
            find_integrase(replicon.id, protein_db.protfile, result_tmp_dir, config)

        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path, replicon.name, config.cmsearch, result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)
        _log.info("Default search done... : ")

        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            pickle_path = os.path.join(result_tmp_dir, "integron_max.pickle")
            if os.path.isfile(pickle_path):
                # results of a previous run are reused and filtered again
                # with the current thresholds
                # NOTE(review): unpickling trusts the content of the
                # temporary directory.
                integron_max = pd.read_pickle(pickle_path)
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[(integron_max.evalue < config.evalue_attc) &
                                            (attc_size < config.max_attc_size) &
                                            (config.min_attc_size < attc_size)]
                _log.info("Search with local_max was already done, continue... :")
            else:
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(integrons, replicon,
                                             config.distance_threshold,
                                             config.model_attc_path,
                                             max_attc_size=config.max_attc_size,
                                             min_attc_size=config.min_attc_size,
                                             circular=circular,
                                             out_dir=result_tmp_dir,
                                             cpu=config.cpu,
                                             evalue_attc=config.evalue_attc)
                integron_max.to_pickle(pickle_path)
                _log.info("Search with local_max done... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0" and not config.no_proteins:  # complete & CALIN
                _log.info("Adding proteins ... :")
                integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()

        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config, result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # j is the rank of the integron among all integrons, starting at 1
            for j, integron in enumerate(integrons, 1):
                if integron.type() != "complete":
                    continue
                pdf_path = os.path.join(config.result_dir, "{}_{}.pdf".format(replicon.id, j))
                integron.draw_integron(file=pdf_path)

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if not integrons:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
        else:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file, sep="\t", index=False, na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary_columns = ['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN']
            summary.to_csv(summary_file, sep="\t", na_rep="NA", index=False,
                           columns=summary_columns)
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db, config.distance_threshold)
                gbk_path = os.path.join(config.result_dir, replicon.id + ".gbk")
                SeqIO.write(replicon, gbk_path, "genbank")
    except integron_finder.EmptyFileError:
        _log.warning('############ Skip replicon {} ############'.format(replicon.name))
        integron_file = ''
        summary_file = ''

    #########################
    # clean temporary files #
    #########################
    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # a failed clean-up is only reported, the results are still returned
            _log.warning("Cannot remove temporary results : '{} : {}'".format(result_tmp_dir, str(err)))

    return integron_file, summary_file
def test_integron_1elem_prot(self):
    """
    add_feature on a report holding a single integron made of one protein.

    Two features are expected on the record, in this order: the integron
    itself, then the protein (a CDS) carrying its translation.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "ACBA.007.P01_13_20",
        "pos_beg": 17375,
        "pos_end": 17375,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "protein",
        "annotation": "protein",
        "model": "NA",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    # Expected translation: protein ACBA.007.P01_13_20 in
    # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
    expected_translation = (
        "MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
        "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*"
    )
    # Feature locations are compared against pos_beg - 1 and pos_end
    expected_start = row["pos_beg"] - 1
    expected_end = row["pos_end"]

    report = pd.DataFrame(row, index=[0])
    seq_before = self.seq.seq
    id_before = self.seq.id

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    features = self.seq.features
    # Exactly 2 features are expected (integron and protein)
    self.assertEqual(len(features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # First feature: the integron
    integron = features[0]
    self.assertEqual(integron.location.start, expected_start)
    self.assertEqual(integron.location.end, expected_end)
    self.assertEqual(integron.strand, 0)
    self.assertEqual(integron.type, "integron")
    self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
    self.assertEqual(integron.qualifiers["integron_type"], row["type"])

    # Second feature: the protein
    protein = features[1]
    self.assertEqual(protein.location.start, expected_start)
    self.assertEqual(protein.location.end, expected_end)
    self.assertEqual(protein.strand, row["strand"])
    self.assertEqual(protein.type, "CDS")
    self.assertEqual(protein.qualifiers["protein_id"], row["element"])
    self.assertEqual(protein.qualifiers["gene"], row["annotation"])
    self.assertEqual(protein.qualifiers["model"], row["model"])
    self.assertEqual(str(protein.qualifiers["translation"]), expected_translation)
def test_integron_2int_nelem(self):
    """
    add_feature on a report describing 2 integrons:

    * integron 1 holds several elements: promoter, integrase, protein
    * integron 2 holds a single attC site

    Integrons are not over the edge of the sequence.
    Six features are expected on the record, in this order: integron 1,
    promoter, integrase, protein, integron 2, attC.
    """
    first_id = "integron_01"
    second_id = "integron_02"
    kind = "complete"

    # Elements of integron 1
    prom = {
        "ID_replicon": self.replicon_id,
        "ID_integron": first_id,
        "element": "Pc_int1",
        "pos_beg": 25,
        "pos_end": 51,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "Promoter",
        "annotation": "Pc_1",
        "model": "NA",
        "type": kind,
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    integrase = {
        "ID_replicon": self.replicon_id,
        "ID_integron": first_id,
        "element": "ACBA.007.P01_13_1",
        "pos_beg": 55,
        "pos_end": 1014,
        "strand": 1,
        "evalue": 1.9e-25,
        "type_elt": "protein",
        "annotation": "intI",
        "model": "NA",
        "type": kind,
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    protein = {
        "ID_replicon": self.replicon_id,
        "ID_integron": first_id,
        "element": "ACBA.007.P01_13_20",
        "pos_beg": 2000,
        "pos_end": 2056,
        "strand": -1,
        "evalue": np.nan,
        "type_elt": "protein",
        "annotation": "protein",
        "model": "intersection_tyr_intI",
        "type": kind,
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    # Single element of integron 2
    attc = {
        "ID_replicon": self.replicon_id,
        "ID_integron": second_id,
        "element": "attc_001",
        "pos_beg": 17825,
        "pos_end": 17884,
        "strand": -1,
        "evalue": 1e-9,
        "type_elt": "attC",
        "annotation": "attC",
        "model": "attc_4",
        "type": kind,
        "default": "Yes",
        "distance_2attC": np.nan,
    }

    # One single-row frame per element, stacked in the order of the report
    report = pd.concat([pd.DataFrame(element, index=[0])
                        for element in (prom, integrase, protein, attc)])

    seq_before = self.seq.seq
    id_before = self.seq.id

    # Expected translations of the integrase and of the protein
    integrase_translation = (
        "MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
        "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
        "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
        "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
        "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
        "HNKMLRPGLCVVHASPQYL*"
    )
    protein_translation = (
        "MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
        "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*"
    )

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    features = self.seq.features
    # Exactly 6 features are expected (integron1, promoter, integrase,
    # protein, integron2, attC)
    self.assertEqual(len(features), 6)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # Feature 1: integron1, from the promoter start to the protein end
    feat = features[0]
    self.assertEqual(feat.location.start, prom["pos_beg"] - 1)
    self.assertEqual(feat.location.end, protein["pos_end"])
    self.assertEqual(feat.strand, 0)
    self.assertEqual(feat.type, "integron")
    self.assertEqual(feat.qualifiers["integron_id"], first_id)
    self.assertEqual(feat.qualifiers["integron_type"], kind)

    # Feature 2: promoter
    feat = features[1]
    self.assertEqual(feat.location.start, prom["pos_beg"] - 1)
    self.assertEqual(feat.location.end, prom["pos_end"])
    self.assertEqual(feat.strand, prom["strand"])
    self.assertEqual(feat.type, "Promoter")
    self.assertEqual(feat.qualifiers["Promoter"], prom["element"])
    self.assertEqual(feat.qualifiers["model"], prom["model"])

    # Feature 3: integrase
    feat = features[2]
    self.assertEqual(feat.location.start, integrase["pos_beg"] - 1)
    self.assertEqual(feat.location.end, integrase["pos_end"])
    self.assertEqual(feat.strand, integrase["strand"])
    self.assertEqual(feat.type, "integrase")
    self.assertEqual(feat.qualifiers["protein_id"], integrase["element"])
    self.assertEqual(feat.qualifiers["gene"], integrase["annotation"])
    self.assertEqual(feat.qualifiers["model"], integrase["model"])
    self.assertEqual(str(feat.qualifiers["translation"]), integrase_translation)

    # Feature 4: protein
    feat = features[3]
    self.assertEqual(feat.location.start, protein["pos_beg"] - 1)
    self.assertEqual(feat.location.end, protein["pos_end"])
    self.assertEqual(feat.strand, protein["strand"])
    self.assertEqual(feat.type, "CDS")
    self.assertEqual(feat.qualifiers["protein_id"], protein["element"])
    self.assertEqual(feat.qualifiers["gene"], protein["annotation"])
    self.assertEqual(feat.qualifiers["model"], protein["model"])
    self.assertEqual(str(feat.qualifiers["translation"]), protein_translation)

    # Feature 5: integron2, same location as its only attC site
    feat = features[4]
    self.assertEqual(feat.location.start, attc["pos_beg"] - 1)
    self.assertEqual(feat.location.end, attc["pos_end"])
    self.assertEqual(feat.strand, 0)
    self.assertEqual(feat.type, "integron")
    self.assertEqual(feat.qualifiers["integron_id"], second_id)
    self.assertEqual(feat.qualifiers["integron_type"], kind)

    # Feature 6: attC
    feat = features[5]
    self.assertEqual(feat.location.start, attc["pos_beg"] - 1)
    self.assertEqual(feat.location.end, attc["pos_end"])
    self.assertEqual(feat.strand, attc["strand"])
    self.assertEqual(feat.type, "attC")
    self.assertEqual(feat.qualifiers["attC"], attc["element"])
    self.assertEqual(feat.qualifiers["model"], attc["model"])
def test_integron_1elem_int(self):
    """
    add_feature on a report holding a single integron made of one integrase.

    Two features are expected on the record, in this order: the integron
    itself, then the integrase carrying its protein translation.
    """
    row = {
        "ID_replicon": self.replicon_id,
        "ID_integron": "integron_01",
        "element": "ACBA.007.P01_13_1",
        "pos_beg": 55,
        "pos_end": 1014,
        "strand": 1,
        "evalue": 1.9e-25,
        "type_elt": "protein",
        "annotation": "intI",
        "model": "intersection_tyr_intI",
        "type": "complete",
        "default": "Yes",
        "distance_2attC": np.nan,
    }
    # Expected translation: protein ACBA.007.P01_13_1 in
    # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
    expected_translation = (
        "MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
        "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
        "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
        "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
        "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
        "HNKMLRPGLCVVHASPQYL*"
    )
    # Feature locations are compared against pos_beg - 1 and pos_end
    expected_start = row["pos_beg"] - 1
    expected_end = row["pos_end"]

    report = pd.DataFrame(row, index=[0])
    seq_before = self.seq.seq
    id_before = self.seq.id

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    features = self.seq.features
    # Exactly 2 features are expected (integron and protein)
    self.assertEqual(len(features), 2)
    # The sequence and the id of the record must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # First feature: the integron
    integron = features[0]
    self.assertEqual(integron.location.start, expected_start)
    self.assertEqual(integron.location.end, expected_end)
    self.assertEqual(integron.strand, 0)
    self.assertEqual(integron.type, "integron")
    self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
    self.assertEqual(integron.qualifiers["integron_type"], row["type"])

    # Second feature: the integrase
    integrase = features[1]
    self.assertEqual(integrase.location.start, expected_start)
    self.assertEqual(integrase.location.end, expected_end)
    self.assertEqual(integrase.strand, row["strand"])
    self.assertEqual(integrase.type, "integrase")
    self.assertEqual(integrase.qualifiers["protein_id"], row["element"])
    self.assertEqual(integrase.qualifiers["gene"], row["annotation"])
    self.assertEqual(integrase.qualifiers["model"], row["model"])
    self.assertEqual(str(integrase.qualifiers["translation"]), expected_translation)
def find_integron_in_one_replicon(replicon, config):
    """
    Search one replicon for integrons and write the result files.

    Depending on the configuration, the following steps are run:

    * search of integrases (skipped when ``config.no_proteins`` is set)
    * search of attC sites
    * second search based on the first results (``config.local_max``)
    * addition of proteins, promoters and attI sites to the integrons
    * functional annotation, when hmm profiles are available
    * writing of the ``.integrons`` and ``.summary`` files, of a genbank file
      (``config.gbk``) and of one pdf per complete integron (``config.pdf``)

    The temporary directory of the replicon is removed at the end unless
    ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons) and the
              summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because of an EmptyFileError,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: when functional annotation is requested and neither
                           'bank_hmm' nor ``config.func_annot_path`` exists.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)

    def _tmp_path(suffix):
        # path of a temporary file named after the replicon id
        return os.path.join(result_tmp_dir, replicon.id + suffix)

    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # any OSError is deliberately ignored here
        # (presumably the directory already exists)
        pass

    tmp_replicon_path = _tmp_path('.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provided on the command line
    is_func_annot = True
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            hmm_dir = 'bank_hmm'
        elif os.path.exists(config.func_annot_path):
            hmm_dir = config.func_annot_path
        else:
            raise IntegronError("the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                                "profile with --path-func-annot option".format(config.func_annot_path))
        fa_hmm = scan_hmm_bank(hmm_dir)
    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
    else:
        # fa_hmm stays undefined; it is only read when is_func_annot is True
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning("No hmm profiles for functional annotation detected, skip functional annotation step.")

    if config.gembase_path:
        protein_db = GembaseDB(replicon, config, gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = _tmp_path("_intI.res")
    phageI_file = _tmp_path("_phage_int.res")
    attC_default_file = _tmp_path("_attc_table.res")

    try:
        # the integrase search is run only when one of its result files is missing
        if not config.no_proteins and not (os.path.isfile(intI_file) and os.path.isfile(phageI_file)):
            find_integrase(replicon.id, protein_db.protfile, result_tmp_dir, config)

        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path, replicon.name, config.cmsearch, result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)
        _log.info("Default search done... : ")

        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            pickle_path = os.path.join(result_tmp_dir, "integron_max.pickle")
            if os.path.isfile(pickle_path):
                # results of a previous run are reused and filtered again
                # with the current thresholds
                # NOTE(review): unpickling trusts the content of the
                # temporary directory.
                integron_max = pd.read_pickle(pickle_path)
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[(integron_max.evalue < config.evalue_attc) &
                                            (attc_size < config.max_attc_size) &
                                            (config.min_attc_size < attc_size)]
                _log.info("Search with local_max was already done, continue... :")
            else:
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(integrons, replicon,
                                             config.distance_threshold,
                                             config.model_attc_path,
                                             max_attc_size=config.max_attc_size,
                                             min_attc_size=config.min_attc_size,
                                             circular=circular,
                                             out_dir=result_tmp_dir,
                                             cpu=config.cpu,
                                             evalue_attc=config.evalue_attc)
                integron_max.to_pickle(pickle_path)
                _log.info("Search with local_max done... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0" and not config.no_proteins:  # complete & CALIN
                _log.info("Adding proteins ... :")
                integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()

        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config, result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # j is the rank of the integron among all integrons, starting at 1
            for j, integron in enumerate(integrons, 1):
                if integron.type() != "complete":
                    continue
                pdf_path = os.path.join(config.result_dir, "{}_{}.pdf".format(replicon.id, j))
                integron.draw_integron(file=pdf_path)

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if not integrons:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
        else:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file, sep="\t", index=False, na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary_columns = ['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN']
            summary.to_csv(summary_file, sep="\t", na_rep="NA", index=False,
                           columns=summary_columns)
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db, config.distance_threshold)
                gbk_path = os.path.join(config.result_dir, replicon.id + ".gbk")
                SeqIO.write(replicon, gbk_path, "genbank")
    except integron_finder.EmptyFileError:
        _log.warning('############ Skip replicon {} ############'.format(replicon.name))
        integron_file = ''
        summary_file = ''

    #########################
    # clean temporary files #
    #########################
    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # a failed clean-up is only reported, the results are still returned
            _log.warning("Cannot remove temporary results : '{} : {}'".format(result_tmp_dir, str(err)))

    return integron_file, summary_file
def test_integron_2int_nelem(self):
    """
    Check add_feature on a report that describes two integrons.

    The first integron holds three elements (a promoter, an integrase and
    a protein); the second one holds a single attC site. None of the
    elements crosses the edge of the sequence.
    """
    first_id = "integron_01"
    kind = "complete"

    def make_row(integron_id, element, begin, end, strand, evalue,
                 type_elt, annotation, model):
        # One report line. The key order is fixed here so that every
        # single-row DataFrame gets its columns in the same order.
        return {"ID_replicon": self.replicon_id,
                "ID_integron": integron_id,
                "element": element,
                "pos_beg": begin,
                "pos_end": end,
                "strand": strand,
                "evalue": evalue,
                "type_elt": type_elt,
                "annotation": annotation,
                "model": model,
                "type": kind,
                "default": "Yes",
                "distance_2attC": np.nan
                }

    # Elements of integron 1
    prom_row = make_row(first_id, "Pc_int1", 25, 51, -1, np.nan,
                        "Promoter", "Pc_1", "NA")
    int_row = make_row(first_id, "ACBA.007.P01_13_1", 55, 1014, 1, 1.9e-25,
                       "protein", "intI", "NA")
    prot_row = make_row(first_id, "ACBA.007.P01_13_20", 2000, 2056, -1,
                        np.nan, "protein", "protein",
                        "intersection_tyr_intI")
    # Single element of integron 2
    attc_row = make_row("integron_02", "attc_001", 17825, 17884, -1, 1e-9,
                        "attC", "attC", "attc_4")

    # Each row becomes its own one-line frame (index 0) before stacking.
    report = pd.concat([pd.DataFrame(row, index=[0])
                        for row in (prom_row, int_row, prot_row, attc_row)])

    # Snapshot of the record, taken before add_feature touches it.
    seq_before = self.seq.seq
    id_before = self.seq.id

    expected_int_translation = (
        "MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
        "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
        "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
        "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
        "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
        "HNKMLRPGLCVVHASPQYL*")
    expected_prot_translation = (
        "MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
        "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*")

    add_feature(self.seq, report, self.prot_db, self.dist_threshold)

    feats = self.seq.features

    # Six features are expected: integron1, promoter, integrase, protein,
    # integron2 and attC
    self.assertEqual(len(feats), 6)
    # The sequence itself and its id must be left untouched
    self.assertEqual(self.seq.seq, seq_before)
    self.assertEqual(self.seq.id, id_before)

    # Feature 1: integron1, spanning from the promoter to the protein
    integron1 = feats[0]
    self.assertEqual(integron1.location.start, prom_row["pos_beg"] - 1)
    self.assertEqual(integron1.location.end, prot_row["pos_end"])
    self.assertEqual(integron1.strand, 0)
    self.assertEqual(integron1.type, "integron")
    self.assertEqual(integron1.qualifiers["integron_id"], first_id)
    self.assertEqual(integron1.qualifiers["integron_type"], kind)

    # Feature 2: promoter
    promoter = feats[1]
    self.assertEqual(promoter.location.start, prom_row["pos_beg"] - 1)
    self.assertEqual(promoter.location.end, prom_row["pos_end"])
    self.assertEqual(promoter.strand, prom_row["strand"])
    self.assertEqual(promoter.type, "Promoter")
    self.assertEqual(promoter.qualifiers["Promoter"], prom_row["element"])
    self.assertEqual(promoter.qualifiers["model"], prom_row["model"])

    # Feature 3: integrase
    integrase = feats[2]
    self.assertEqual(integrase.location.start, int_row["pos_beg"] - 1)
    self.assertEqual(integrase.location.end, int_row["pos_end"])
    self.assertEqual(integrase.strand, int_row["strand"])
    self.assertEqual(integrase.type, "integrase")
    self.assertEqual(integrase.qualifiers["protein_id"], int_row["element"])
    self.assertEqual(integrase.qualifiers["gene"], int_row["annotation"])
    self.assertEqual(integrase.qualifiers["model"], int_row["model"])
    self.assertEqual(str(integrase.qualifiers["translation"]),
                     expected_int_translation)

    # Feature 4: protein
    protein = feats[3]
    self.assertEqual(protein.location.start, prot_row["pos_beg"] - 1)
    self.assertEqual(protein.location.end, prot_row["pos_end"])
    self.assertEqual(protein.strand, prot_row["strand"])
    self.assertEqual(protein.type, "CDS")
    self.assertEqual(protein.qualifiers["protein_id"], prot_row["element"])
    self.assertEqual(protein.qualifiers["gene"], prot_row["annotation"])
    self.assertEqual(protein.qualifiers["model"], prot_row["model"])
    self.assertEqual(str(protein.qualifiers["translation"]),
                     expected_prot_translation)

    # Feature 5: integron2, which covers exactly its only attC site
    integron2 = feats[4]
    self.assertEqual(integron2.location.start, attc_row["pos_beg"] - 1)
    self.assertEqual(integron2.location.end, attc_row["pos_end"])
    self.assertEqual(integron2.strand, 0)
    self.assertEqual(integron2.type, "integron")
    self.assertEqual(integron2.qualifiers["integron_id"], "integron_02")
    self.assertEqual(integron2.qualifiers["integron_type"], kind)

    # Feature 6: attC
    attc = feats[5]
    self.assertEqual(attc.location.start, attc_row["pos_beg"] - 1)
    self.assertEqual(attc.location.end, attc_row["pos_end"])
    self.assertEqual(attc.strand, attc_row["strand"])
    self.assertEqual(attc.type, "attC")
    self.assertEqual(attc.qualifiers["attC"], attc_row["element"])
    self.assertEqual(attc.qualifiers["model"], attc_row["model"])