def test_iter(self):
    """
    Check the protein identifiers yielded when iterating over a GembaseDB.

    For a 'Draft' and a 'Complet' dataset, the ids provided by the db are
    compared with the ids of the Gembase protein file (.prt) which match
    the replicon id.
    """
    # test Gembase Draft
    seq_name = 'ACBA.0917.00019'
    ext = '.fna'
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    db = GembaseDB(replicon, cfg)
    idx = SeqIO.index(self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                      'fasta',
                      alphabet=Seq.IUPAC.extended_protein)
    specie, date, strain, contig = replicon.id.split('.')
    # raw string: '\.' and '\w' are regex escapes, not python string escapes
    pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
    self.assertListEqual(sorted(i for i in idx if re.match(pattern, i)),
                         sorted(db))

    # test Gembase Complet
    seq_name = 'ESCO001.C.00001.C001'
    ext = '.fst'
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    with self.catch_log():
        db = GembaseDB(replicon, cfg)
    idx = SeqIO.index(self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')),
                      'fasta',
                      alphabet=Seq.IUPAC.extended_protein)
    specie, date, strain, contig = replicon.id.split('.')
    pattern = r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig)
    seqid_from_gembase_protfile = {i for i in idx if re.match(pattern, i)}
    seqid_from_if = set(db)
    non_common_seq = seqid_from_gembase_protfile ^ seqid_from_if
    # In Gembase complete, the annotation in LSTINFO is provided from genbank
    # and it appears sometimes that some CDS are not translated in proteins.
    # So in the data, 3 genes from LSTINFO are not in the .prt file.
    diff = {'ESCO001.C.00001.C001_03974',
            'ESCO001.C.00001.C001_01509',
            'ESCO001.C.00001.C001_04162'}
    self.assertSetEqual(non_common_seq, diff)
def test_find_gembase_file_basename_file_not_in_gembase(self):
    """
    test if find_gembase_file_basename get the right basename
    for files not located in gembase and file name is the output of split operation:

    * a file containing one contig
    * a file representing a chunk

    and that a FileNotFoundError is raised when no lst file matches the replicon.
    """
    gembase_path = self.find_data('Gembase')
    chunk_path = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')

    # expected gembase basename -> path of the replicon file to analyse
    # NOTE(review): assumes the basename is returned without extension, as
    # test_find_gembase_file_basename expects for files in gembase - TODO confirm
    file_names = {
        'ACBA.0917.00019': self.find_data(os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
        'ESCO001.C.00001.C001': chunk_path,
    }
    shutil.copyfile(os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
                    chunk_path)

    for base_file_name, replicon_path in file_names.items():
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
        # assertEqual and not assertTrue: the second argument of assertTrue
        # is the failure message, so nothing was compared
        self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                         base_file_name)

    # a replicon without corresponding lst file in gembase
    replicon_path = self.find_data(os.path.join('Replicons', 'acba.007.p01.13.fst'))
    self.args.replicon = replicon_path
    self.args.gembase_path = gembase_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))
    with self.assertRaises(FileNotFoundError) as ctx:
        with self.catch_log():
            GembaseDB(replicon, cfg, gembase_path=gembase_path)
    self.assertEqual(str(ctx.exception),
                     'cannot find lst file matching {} sequence'.format(replicon_path))
def test_gembase_draft_parser(self):
    """
    Parse a 'Draft' lst file with gembase_draft_parser and check
    the columns, the shape, the first and the last rows of the result.
    """
    replicon_name = 'ACBA.0917.00019'
    replicon_id = 'ACBA.0917.00019.0001'
    lst_file = os.path.join('Gembase', 'LSTINFO', replicon_name + '.lst')
    prots_info = GembaseDB.gembase_draft_parser(self.find_data(lst_file), replicon_id)

    expected_columns = ['start', 'end', 'strand', 'type', 'seq_id', 'gene_name', 'description']
    self.assertListEqual(list(prots_info.columns), expected_columns)
    self.assertEqual(prots_info.shape, (3870, len(expected_columns)))

    expected_first = [
        266, 1480, 'C', 'CDS', 'ACBA.0917.00019.b0001_00001', 'tyrS',
        '| Tyrosine--tRNA ligase | 6.1.1.1 | similar to AA sequence:UniProtKB:P41256',
    ]
    self.assertListEqual(expected_first, prots_info.iloc[0].values.tolist())

    expected_last = [
        4043755, 4044354, 'C', 'CDS', 'ACBA.0917.00019.i0001_03957', 'yfcG_3',
        '| Disulfide-bond oxidoreductase YfcG | 1.8.4.- | similar to AA sequence:UniProtKB:P77526',
    ]
    self.assertListEqual(expected_last, prots_info.iloc[-1].values.tolist())
def test_make_protfile(self):
    """
    Check that the sequences of the protein file made by GembaseDB have
    the same ids, in the same order, as those of the Gembase protein file (.prt).
    """
    # NOTE(review): the third field looks like an expected number of sequences,
    # but it was never verified: the loop counter shadowed it and was compared
    # with itself. The values cannot be confirmed from here, so they are not
    # asserted - TODO confirm them and compare with the number of sequences read.
    file_name = (('ACBA.0917.00019', '.fna', 3870),
                 ('ESCO001.C.00001.C001', '.fst', 3870))
    for seq_name, ext, _expected_seq_nb in file_name:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        expected_seqs = read_multi_prot_fasta(
            self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')))
        received_seqs = read_multi_prot_fasta(db.protfile)
        compared = 0
        for expected, test in zip(expected_seqs, received_seqs):
            self.assertEqual(expected.id, test.id)
            compared += 1
        # zip stops silently on the shortest input:
        # ensure that the comparison above was not vacuous
        self.assertGreater(compared, 0)
def test_getitem(self):
    """
    Check that a protein can be retrieved from a GembaseDB with its id
    (same id and same sequence as in the Gembase protein file)
    and that an unknown id raises a KeyError.
    """
    file_name = (('ACBA.0917.00019', '.fna'),
                 ('ESCO001.C.00001.C001', '.fst'))
    for seq_name, ext in file_name:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        exp = read_multi_prot_fasta(
            self.find_data(os.path.join('Gembase', 'Proteins', seq_name + '.prt')))

        specie, date, strain, contig = replicon.id.split('.')
        # raw string: '\.' and '\w' are regex escapes, not python string escapes
        # compiled once as it is applied on each protein of the file
        pattern = re.compile(r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig))

        for prot_expected in exp:
            # consider only the proteins which belong to the replicon
            if pattern.match(prot_expected.id):
                prot_received = db[prot_expected.id]
                self.assertEqual(prot_received.id, prot_expected.id)
                self.assertEqual(prot_received.seq, prot_expected.seq)

    with self.assertRaises(KeyError) as ctx:
        db['nimport_naoik']
    self.assertEqual(str(ctx.exception), "'nimport_naoik'")
def test_gembase_complete_parser(self):
    """
    Parse a 'Complet' lst file with gembase_complete_parser and check
    the columns, the shape, the first and the last rows of the result.
    """
    replicon_id = 'ESCO001.C.00001.C001'
    lst_file = os.path.join('Gembase', 'LSTINFO', replicon_id + '.lst')
    prots_info = GembaseDB.gembase_complete_parser(self.find_data(lst_file), replicon_id)

    expected_columns = ['start', 'end', 'strand', 'type', 'seq_id',
                        'valid', 'gene_name', 'description']
    self.assertListEqual(list(prots_info.columns), expected_columns)
    self.assertEqual(prots_info.shape, (4139, len(expected_columns)))

    expected_first = [
        190, 255, 'D', 'CDS', 'ESCO001.C.00001.C001_00001', 'Valid', 'thrL',
        '@b0001@NP_414542.1@ b0001 1 190 255 | leader; Amino acid biosynthesis:'
        ' Threonine thr operon leader peptide | ..',
    ]
    self.assertListEqual(expected_first, prots_info.iloc[0].values.tolist())

    expected_last = [
        4640942, 4641628, 'D', 'CDS', 'ESCO001.C.00001.C001_04495', 'Valid', 'yjtD',
        '@b4403@NP_418820.1@ b4403 1 4640942 4641628 | putative methyltransferase | ..',
    ]
    self.assertListEqual(expected_last, prots_info.iloc[-1].values.tolist())
def test_gembase_sniffer(self):
    """Check the gembase type returned by gembase_sniffer for two lst files."""
    expected_types = (
        ('ACBA.0917.00019', 'Draft'),
        ('ESCO001.C.00001.C001', 'Complet'),
    )
    lst_dir = os.path.join('Gembase', 'LSTINFO')
    for basename, expected_type in expected_types:
        lst_path = self.find_data(os.path.join(lst_dir, basename + '.lst'))
        self.assertEqual(GembaseDB.gembase_sniffer(lst_path), expected_type)
def test_find_gembase_file_basename_file_not_in_gembase(self):
    """
    test if find_gembase_file_basename get the right basename
    for files not located in gembase and file name is the output of split operation:

    * a file containing one contig
    * a file representing a chunk

    and that a FileNotFoundError is raised when no lst file matches the replicon.
    """
    gembase_path = self.find_data('Gembase')
    chunk_path = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')

    # expected gembase basename -> path of the replicon file to analyse
    # NOTE(review): assumes the basename is returned without extension, as
    # test_find_gembase_file_basename expects for files in gembase - TODO confirm
    file_names = {
        'ACBA.0917.00019': self.find_data(os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
        'ESCO001.C.00001.C001': chunk_path,
    }
    shutil.copyfile(os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
                    chunk_path)

    for base_file_name, replicon_path in file_names.items():
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
        # assertEqual and not assertTrue: the second argument of assertTrue
        # is the failure message, so nothing was compared
        self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                         base_file_name)

    # a replicon without corresponding lst file in gembase
    replicon_path = self.find_data(os.path.join('Replicons', 'acba.007.p01.13.fst'))
    self.args.replicon = replicon_path
    self.args.gembase_path = gembase_path
    cfg = Config(self.args)
    seq_db = read_multi_prot_fasta(replicon_path)
    replicon = next(seq_db)
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))
    with self.assertRaises(FileNotFoundError) as ctx:
        with self.catch_log():
            GembaseDB(replicon, cfg, gembase_path=gembase_path)
    self.assertEqual(str(ctx.exception),
                     'cannot find lst file matching {} sequence'.format(replicon_path))
def test_find_gembase_file_basename(self):
    """
    test if find_gembase_file_basename get the right basename
    (the file name without its extension) for files in gembase
    """
    gembase_path = self.find_data('Gembase')
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertEqual and not assertTrue: the second argument of assertTrue
        # is the failure message, so nothing was compared
        self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                         os.path.splitext(file_name)[0])
def test_get_description(self):
    """
    Check the SeqDesc returned by get_description for known protein ids,
    and the errors raised for a malformed id and for an unknown id.
    """
    # SeqDesc(id, strand, start, stop)
    first_id = 'ACBA.0917.00019.b0001_00001'
    last_id = 'ACBA.0917.00019.i0001_03957'
    file_name = {
        ('ACBA.0917.00019', '.fna'): {
            first_id: SeqDesc(first_id, -1, 266, 1480),
            last_id: SeqDesc(last_id, -1, 4043755, 4044354),
        },
    }
    for (seq_name, ext), descriptions in file_name.items():
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        db = GembaseDB(replicon, cfg)

        for seq_id, expected_desc in descriptions.items():
            self.assertEqual(expected_desc, db.get_description(seq_id))

        # an id which does not look like a Gembase protein id
        with self.assertRaises(IntegronError) as ctx:
            db.get_description('nimport_naoik')
        self.assertEqual(str(ctx.exception),
                         "'nimport_naoik' is not a valid Gembase protein identifier.")

        # an id with the right syntax but which is not in the db
        with self.assertRaises(KeyError) as ctx:
            db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
def test_find_gembase_file_basename(self):
    """
    test if find_gembase_file_basename get the right basename
    (the file name without its extension) for files in gembase
    """
    gembase_path = self.find_data('Gembase')
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertEqual and not assertTrue: the second argument of assertTrue
        # is the failure message, so nothing was compared
        self.assertEqual(db._find_gembase_file_basename(gembase_path, replicon_path),
                         os.path.splitext(file_name)[0])
def test_ProteinDB(self):
    """
    Check that a GembaseDB can be built from a Gembase Draft and
    a Gembase Complete replicon, and that it references this replicon.
    """
    # From Gembase Draft, Gembase Complete
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        seq_db = read_multi_prot_fasta(replicon_path)
        replicon = next(seq_db)
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertEqual and not assertTrue: the second argument of assertTrue
        # is the failure message, so the ids were not compared
        self.assertEqual(db.replicon.id, replicon.id)
def test_read_multi(self):
    """
    Test reading hmm results when there are multiple hits:
    2 hits on the same protein: keep only the one with the best evalue.
    2 hits on 2 different proteins: keep the 2 proteins.
    """
    replicon_id = 'ACBA.0917.00019'
    contig_id = 'ACBA.0917.00019.0001'
    result_dir_expected = self.find_data("Results_Integron_Finder_{}.gembase".format(replicon_id))
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
    contig_tmp_dir = os.path.join(result_dir_expected, "tmp_{}".format(contig_id))
    prot_file = os.path.join(contig_tmp_dir, contig_id + '.prt')

    args = argparse.Namespace()
    args.gembase = True
    args.replicon = replicon_path
    cfg = Config(args)

    replicon = next(read_multi_prot_fasta(replicon_path))
    prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

    infile = self.find_data(os.path.join('fictive_results', "{}_intI_multi.res".format(contig_id)))
    df = read_hmm(contig_id, prot_db, infile, cfg)

    # the 2 hits expected, one per protein
    expected_hits = {
        "Accession_number": [contig_id] * 2,
        "query_name": ["Phage_integrase"] * 2,
        "ID_query": ["PF00589.16"] * 2,
        "ID_prot": ["ACBA.0917.00019.i0001_00298", "ACBA.0917.00019.i0001_00338"],
        "strand": [-1, -1],
        "pos_beg": [311597, 350328],
        "pos_end": [312631, 351248],
        "evalue": [5.5e-66, 3.4e-51],
    }
    column_order = ["Accession_number", "query_name", "ID_query", "ID_prot",
                    "strand", "pos_beg", "pos_end", "evalue"]
    exp = pd.DataFrame(data=expected_hits, index=[0, 1])
    exp = exp[column_order]
    pdt.assert_frame_equal(df, exp)
def test_protfile(self):
    """
    Check that the protfile of a GembaseDB is '<replicon id>.prt'
    in the temporary directory of the replicon.
    """
    datasets = (
        ('ACBA.0917.00019', '.fna'),
        ('ESCO001.C.00001.C001', '.fst'),
    )
    for seq_name, ext in datasets:
        replicon_file = os.path.join('Gembase', 'Replicons', seq_name + ext)
        replicon_path = self.find_data(replicon_file)
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        expected_protfile = os.path.join(cfg.tmp_dir(replicon.id), replicon.id + '.prt')
        self.assertEqual(expected_protfile, db.protfile)
def test_gembase_complete_parser(self):
    """
    Parse a 'Complet' lst file with gembase_complete_parser and check
    the columns, the shape, the first and the last rows of the result.
    """
    replicon_id = 'ESCO001.C.00001.C001'
    lst_file = os.path.join('Gembase', 'LSTINFO', replicon_id + '.lst')
    prots_info = GembaseDB.gembase_complete_parser(self.find_data(lst_file), replicon_id)

    expected_columns = ['start', 'end', 'strand', 'type', 'seq_id',
                        'valid', 'gene_name', 'description']
    self.assertListEqual(list(prots_info.columns), expected_columns)
    self.assertEqual(prots_info.shape, (4139, len(expected_columns)))

    expected_first = [
        190, 255, 'D', 'CDS', 'ESCO001.C.00001.C001_00001', 'Valid', 'thrL',
        '@b0001@NP_414542.1@ b0001 1 190 255 | leader; Amino acid biosynthesis:'
        ' Threonine thr operon leader peptide | ..',
    ]
    self.assertListEqual(expected_first, prots_info.iloc[0].values.tolist())

    expected_last = [
        4640942, 4641628, 'D', 'CDS', 'ESCO001.C.00001.C001_04495', 'Valid', 'yjtD',
        '@b4403@NP_418820.1@ b4403 1 4640942 4641628 | putative methyltransferase | ..',
    ]
    self.assertListEqual(expected_last, prots_info.iloc[-1].values.tolist())
def test_gembase_draft_parser(self):
    """
    Parse a 'Draft' lst file with gembase_draft_parser and check
    the columns, the shape, the first and the last rows of the result.
    """
    replicon_name = 'ACBA.0917.00019'
    replicon_id = 'ACBA.0917.00019.0001'
    lst_file = os.path.join('Gembase', 'LSTINFO', replicon_name + '.lst')
    prots_info = GembaseDB.gembase_draft_parser(self.find_data(lst_file), replicon_id)

    expected_columns = ['start', 'end', 'strand', 'type', 'seq_id', 'gene_name', 'description']
    self.assertListEqual(list(prots_info.columns), expected_columns)
    self.assertEqual(prots_info.shape, (3870, len(expected_columns)))

    expected_first = [
        266, 1480, 'C', 'CDS', 'ACBA.0917.00019.b0001_00001', 'tyrS',
        '| Tyrosine--tRNA ligase | 6.1.1.1 | similar to AA sequence:UniProtKB:P41256',
    ]
    self.assertListEqual(expected_first, prots_info.iloc[0].values.tolist())

    expected_last = [
        4043755, 4044354, 'C', 'CDS', 'ACBA.0917.00019.i0001_03957', 'yfcG_3',
        '| Disulfide-bond oxidoreductase YfcG | 1.8.4.- | similar to AA sequence:UniProtKB:P77526',
    ]
    self.assertListEqual(expected_last, prots_info.iloc[-1].values.tolist())
def test_read_hmm_gembase(self):
    """
    Test that the hmm hits are well read, when the gembase format is used
    (.prt file is provided, prodigal is not used to find the proteins).
    """
    replicon_id = 'ACBA.0917.00019'
    contig_id = 'ACBA.0917.00019.0001'
    result_dir_expected = self.find_data("Results_Integron_Finder_{}.gembase".format(replicon_id))
    replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', replicon_id + '.fna'))
    contig_tmp_dir = os.path.join(result_dir_expected, "tmp_{}".format(contig_id))
    prot_file = os.path.join(contig_tmp_dir, contig_id + '.prt')
    infile = os.path.join(contig_tmp_dir, "{}_intI.res".format(contig_id))

    args = argparse.Namespace()
    args.gembase = True
    args.replicon = replicon_path
    cfg = Config(args)

    replicon = next(read_multi_prot_fasta(replicon_path))
    prot_db = GembaseDB(replicon, cfg, prot_file=prot_file)

    df = read_hmm(contig_id, prot_db, infile, cfg)

    # the single hit expected
    expected_hit = {
        "Accession_number": contig_id,
        "query_name": "intI_Cterm",
        "ID_query": "-",
        "ID_prot": "ACBA.0917.00019.i0001_00298",
        "strand": -1,
        "pos_beg": 311597,
        "pos_end": 312631,
        "evalue": 3.6e-25,
    }
    column_order = ["Accession_number", "query_name", "ID_query", "ID_prot",
                    "strand", "pos_beg", "pos_end", "evalue"]
    exp = pd.DataFrame(data=expected_hit, index=[0])
    exp = exp[column_order]
    pdt.assert_frame_equal(df, exp)
def test_get_description(self):
    """
    Check the SeqDesc returned by get_description for known protein ids,
    and the errors raised for a malformed id and for an unknown id.
    """
    # SeqDesc(id, strand, start, stop)
    first_id = 'ACBA.0917.00019.b0001_00001'
    last_id = 'ACBA.0917.00019.i0001_03957'
    file_name = {
        ('ACBA.0917.00019', '.fna'): {
            first_id: SeqDesc(first_id, -1, 266, 1480),
            last_id: SeqDesc(last_id, -1, 4043755, 4044354),
        },
    }
    for (seq_name, ext), descriptions in file_name.items():
        replicon_path = self.find_data(os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        db = GembaseDB(replicon, cfg)

        for seq_id, expected_desc in descriptions.items():
            self.assertEqual(expected_desc, db.get_description(seq_id))

        # an id which does not look like a Gembase protein id
        with self.assertRaises(IntegronError) as ctx:
            db.get_description('nimport_naoik')
        self.assertEqual(str(ctx.exception),
                         "'nimport_naoik' is not a valid Gembase protein identifier.")

        # an id with the right syntax but which is not in the db
        with self.assertRaises(KeyError) as ctx:
            db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(ctx.exception), "'FOO.BAR.00019.i0001_03924'")
def test_gembase_sniffer(self):
    """Check the gembase type returned by gembase_sniffer for two lst files."""
    expected_types = (
        ('ACBA.0917.00019', 'Draft'),
        ('ESCO001.C.00001.C001', 'Complet'),
    )
    lst_dir = os.path.join('Gembase', 'LSTINFO')
    for basename, expected_type in expected_types:
        lst_path = self.find_data(os.path.join(lst_dir, basename + '.lst'))
        self.assertEqual(GembaseDB.gembase_sniffer(lst_path), expected_type)
def _find_func_annot_profiles(config):
    """
    Select and scan the hmm profiles bank to use for the functional annotation.

    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: a tuple (is_func_annot, fa_hmm) where is_func_annot tells if the functional annotation
              is requested, and fa_hmm is the result of :func:`scan_hmm_bank` on the selected bank
              (None when the functional annotation is not requested).
    :raises IntegronError: if the functional annotation is requested without --path-func-annot
                           and neither 'bank_hmm' nor config.func_annot_path exist.
    """
    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provide on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            return True, scan_hmm_bank('bank_hmm')
        if os.path.exists(config.func_annot_path):
            return True, scan_hmm_bank(config.func_annot_path)
        raise IntegronError("the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                            "profile with --path-func-annot option".format(config.func_annot_path))
    if config.path_func_annot and config.no_proteins is False:
        return True, scan_hmm_bank(config.path_func_annot)
    return False, None


def find_integron_in_one_replicon(replicon, config):
    """
    scan replicon for integron.

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id.summary>).
              if there is no integron the summary file is None.
              if the replicon is skipped (an EmptyFileError occurred) both are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if the functional annotation is requested
                           but no hmm profiles directory can be found.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except FileExistsError:
        # the tmp dir can remain from a previous run, reuse it.
        # any other error is not silenced: the replicon cannot be written below anyway
        pass
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    is_func_annot, fa_hmm = _find_func_annot_profiles(config)
    if is_func_annot and not fa_hmm:
        _log.warning("No hmm profiles for functional annotation detected, skip functional annotation step.")

    if config.gembase_path:
        protein_db = GembaseDB(replicon, config, gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir, replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # skip the integrase search if both results are already there
            if not os.path.isfile(intI_file) or not os.path.isfile(phageI_file):
                find_integrase(replicon.id, protein_db.protfile, result_tmp_dir, config)

        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path, replicon.name, config.cmsearch, result_tmp_dir, config.model_attc_path,
                      incE=config.evalue_attc, cpu=config.cpu)
        _log.info("Default search done... : ")

        integrons = find_integron(replicon, protein_db, attC_default_file, intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_path = os.path.join(result_tmp_dir, "integron_max.pickle")
            if not os.path.isfile(integron_max_path):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(integrons, replicon, config.distance_threshold,
                                             config.model_attc_path,
                                             max_attc_size=config.max_attc_size,
                                             min_attc_size=config.min_attc_size,
                                             circular=circular,
                                             out_dir=result_tmp_dir,
                                             cpu=config.cpu,
                                             evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_path)
                _log.info("Search with local_max done... :")
            else:
                # NOTE(review): the pickle is read back from the tmp dir of the results;
                # unpickling is unsafe if this directory can be written by an untrusted party.
                integron_max = pd.read_pickle(integron_max_path)
                # the cached results are filtered with the current thresholds
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[(integron_max.evalue < config.evalue_attc) &
                                            (attc_size < config.max_attc_size) &
                                            (config.min_attc_size < attc_size)]
                _log.info("Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max, intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0":  # complete & CALIN
                if not config.no_proteins:
                    _log.info("Adding proteins ... :")
                    integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()

        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config, result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # only the complete integrons are drawn
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(config.result_dir,
                                                             "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file, sep="\t", index=False, na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file, sep="\t", na_rep="NA", index=False,
                           columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db, config.distance_threshold)
                SeqIO.write(replicon, os.path.join(config.result_dir, replicon.id + ".gbk"), "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped, empty paths are returned
        _log.warning('############ Skip replicon {} ############'.format(replicon.name))
        integron_file = ''
        summary_file = ''

    #########################
    # clean temporary files #
    #########################
    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # the cleaning is a best effort, it must not make the analysis fail
            _log.warning("Cannot remove temporary results : '{} : {}'".format(result_tmp_dir, str(err)))

    return integron_file, summary_file