def test_find_gembase_file_basename_file_not_in_gembase(self):
    """
    Test that ``_find_gembase_file_basename`` gets the right basename for
    replicon files that are not located in the gembase and whose file name
    is the output of a split operation:

    * a file containing one contig
    * a file representing a chunk

    Also check that building a ``GembaseDB`` from a replicon for which no
    lst file can be found raises ``FileNotFoundError`` with the expected
    message.
    """
    gembase_path = self.find_data('Gembase')
    chunk_path = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')
    # expected basename -> replicon file handed to GembaseDB
    file_names = {
        'ACBA.0917.00019': self.find_data(
            os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
        'ESCO001.C.00001.C001': chunk_path,
    }
    # The chunk is a copy of a gembase replicon stored under a "_chunk_N" name.
    shutil.copyfile(
        os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
        chunk_path)

    for base_file_name, replicon_path in file_names.items():
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
        # assertTrue(value, expected) only checked truthiness: its second
        # argument is the failure message. Compare the values for real.
        # NOTE(review): the chunk key used to end with '.fst'; the
        # extension-less key assumes the helper returns the same kind of
        # basename as in the one-contig case -- confirm against
        # _find_gembase_file_basename.
        self.assertEqual(
            db._find_gembase_file_basename(gembase_path, replicon_path),
            base_file_name)

    # A replicon without any matching lst file must be rejected.
    replicon_path = self.find_data(
        os.path.join('Replicons', 'acba.007.p01.13.fst'))
    self.args.replicon = replicon_path
    self.args.gembase_path = gembase_path
    cfg = Config(self.args)
    replicon = next(read_multi_prot_fasta(replicon_path))
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))
    with self.assertRaises(FileNotFoundError) as ctx:
        with self.catch_log():
            GembaseDB(replicon, cfg, gembase_path=gembase_path)
    self.assertEqual(
        str(ctx.exception),
        'cannot find lst file matching {} sequence'.format(replicon_path))
def test_protfile(self):
    """
    Check that ``ProdigalDB.protfile`` is the ``ACBA.007.P01_13.prt`` path
    inside the temporary directory of the replicon.
    """
    stem = 'acba.007.p01.13'
    expected_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    # Only the first record of the fasta file is used as replicon.
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)
    expected_path = os.path.join(config.tmp_dir(record.id), expected_name)
    self.assertEqual(expected_path, prot_db.protfile)
def test_protfile(self):
    """
    For each Gembase replicon, check that ``GembaseDB.protfile`` is
    ``<replicon tmp dir>/<replicon id>.prt``.
    """
    cases = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for stem, suffix in cases:
        fasta_path = self.find_data(
            os.path.join('Gembase', 'Replicons', stem + suffix))
        self.args.replicon = fasta_path
        config = Config(self.args)
        # Only the first record of the fasta file is used as replicon.
        record = next(read_multi_prot_fasta(fasta_path))
        record.path = fasta_path
        os.makedirs(config.tmp_dir(record.id))
        with self.catch_log():
            prot_db = GembaseDB(record, config)
        expected_path = os.path.join(config.tmp_dir(record.id),
                                     record.id + '.prt')
        self.assertEqual(expected_path, prot_db.protfile)
def test_protfile(self):
    """
    Check that the protein file path exposed by ``ProdigalDB`` is
    ``ACBA.007.P01_13.prt`` located in the replicon temporary directory.
    """
    stem = 'acba.007.p01.13'
    expected_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    # The replicon under test is the first record of the fasta file.
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)
    expected_path = os.path.join(config.tmp_dir(record.id), expected_name)
    self.assertEqual(expected_path, prot_db.protfile)
def test_getitem(self):
    """
    For each Gembase replicon, check that ``GembaseDB.__getitem__`` returns
    the same id and sequence as the reference ``.prt`` file for every
    reference protein whose id matches the replicon id, and that an unknown
    id raises ``KeyError``.
    """
    file_name = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for seq_name, ext in file_name:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        exp = read_multi_prot_fasta(
            self.find_data(
                os.path.join('Gembase', 'Proteins', seq_name + '.prt')))
        specie, date, strain, contig = replicon.id.split('.')
        # Raw string: '\.' and '\w' are regex escapes, not string escapes
        # (the non-raw form triggers an invalid-escape warning).
        # Compiled once because it is applied to every reference protein.
        prot_of_replicon = re.compile(
            r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig))
        for prot_expected in exp:
            # Only the proteins belonging to this replicon are looked up.
            if prot_of_replicon.match(prot_expected.id):
                prot_received = db[prot_expected.id]
                self.assertEqual(prot_received.id, prot_expected.id)
                self.assertEqual(prot_received.seq, prot_expected.seq)

        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
def test_get_description(self):
    """
    Check ``GembaseDB.get_description`` against known descriptions, and
    check the exceptions raised for a malformed identifier
    (``IntegronError``) and for an unknown identifier (``KeyError``).
    """
    # SeqDesc(id, strand, start, stop)
    expected_by_replicon = {
        ('ACBA.0917.00019', '.fna'): {
            'ACBA.0917.00019.b0001_00001':
                SeqDesc('ACBA.0917.00019.b0001_00001', -1, 266, 1480),
            'ACBA.0917.00019.i0001_03957':
                SeqDesc('ACBA.0917.00019.i0001_03957', -1, 4043755, 4044354),
        },
    }
    for (stem, suffix), expected_descs in expected_by_replicon.items():
        fasta_path = self.find_data(
            os.path.join('Gembase', 'Replicons', stem + suffix))
        self.args.replicon = fasta_path
        config = Config(self.args)
        record = next(read_multi_prot_fasta(fasta_path))
        record.path = fasta_path
        os.makedirs(config.tmp_dir(record.id))
        prot_db = GembaseDB(record, config)

        for prot_id, expected in expected_descs.items():
            self.assertEqual(expected, prot_db.get_description(prot_id))

        with self.assertRaises(IntegronError) as raised:
            prot_db.get_description('nimport_naoik')
        self.assertEqual(
            str(raised.exception),
            "'nimport_naoik' is not a valid Gembase protein identifier.")

        with self.assertRaises(KeyError) as raised:
            prot_db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(raised.exception),
                         "'FOO.BAR.00019.i0001_03924'")
def test_getitem(self):
    """
    For each Gembase replicon, look up in ``GembaseDB`` every reference
    protein whose id matches the replicon id and compare id and sequence
    with the reference ``.prt`` record; an unknown id must raise
    ``KeyError``.
    """
    file_name = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for seq_name, ext in file_name:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', seq_name + ext))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)

        exp = read_multi_prot_fasta(
            self.find_data(
                os.path.join('Gembase', 'Proteins', seq_name + '.prt')))
        specie, date, strain, contig = replicon.id.split('.')
        # The pattern must be a raw string: '\.' and '\w' are meant for the
        # regex engine and are invalid escapes in a plain string literal.
        # It is compiled once as it is matched against every protein id.
        prot_of_replicon = re.compile(
            r'{}\.{}\.{}\.\w?{}'.format(specie, date, strain, contig))
        for prot_expected in exp:
            # Skip the reference proteins which do not belong to the replicon.
            if prot_of_replicon.match(prot_expected.id):
                prot_received = db[prot_expected.id]
                self.assertEqual(prot_received.id, prot_expected.id)
                self.assertEqual(prot_received.seq, prot_expected.seq)

        with self.assertRaises(KeyError) as ctx:
            db['nimport_naoik']
        self.assertEqual(str(ctx.exception), "'nimport_naoik'")
def test_make_protfile(self):
    """
    Build a ``GembaseDB`` for each Gembase replicon and compare, pairwise
    and in order, the ids of the reference ``.prt`` records with the ids of
    the records read back from ``db.protfile``.
    """
    cases = (('ACBA.0917.00019', '.fna', 3870),
             ('ESCO001.C.00001.C001', '.fst', 3870))
    for stem, suffix, seq_nb in cases:
        fasta_path = self.find_data(
            os.path.join('Gembase', 'Replicons', stem + suffix))
        self.args.replicon = fasta_path
        config = Config(self.args)
        record = next(read_multi_prot_fasta(fasta_path))
        record.path = fasta_path
        os.makedirs(config.tmp_dir(record.id))
        with self.catch_log():
            prot_db = GembaseDB(record, config)

        reference = read_multi_prot_fasta(
            self.find_data(
                os.path.join('Gembase', 'Proteins', stem + '.prt')))
        produced = read_multi_prot_fasta(prot_db.protfile)
        # NOTE(review): the counter below rebinds ``seq_nb`` (the expected
        # number taken from ``cases``), so the final assertion compares a
        # value with itself and the expected count is never enforced.
        # zip() also stops at the shorter input. Confirm the expected counts
        # before turning this into a real check.
        for seq_nb, (ref_seq, got_seq) in enumerate(zip(reference, produced), 1):
            self.assertEqual(ref_seq.id, got_seq.id)
        self.assertEqual(seq_nb, seq_nb)
def test_protfile(self):
    """
    Check, for each Gembase replicon, that the protein file path exposed by
    ``GembaseDB`` is ``<replicon tmp dir>/<replicon id>.prt``.
    """
    cases = (('ACBA.0917.00019', '.fna'), ('ESCO001.C.00001.C001', '.fst'))
    for stem, suffix in cases:
        fasta_path = self.find_data(
            os.path.join('Gembase', 'Replicons', stem + suffix))
        self.args.replicon = fasta_path
        config = Config(self.args)
        # The replicon under test is the first record of the fasta file.
        record = next(read_multi_prot_fasta(fasta_path))
        record.path = fasta_path
        os.makedirs(config.tmp_dir(record.id))
        with self.catch_log():
            prot_db = GembaseDB(record, config)
        expected_path = os.path.join(config.tmp_dir(record.id),
                                     record.id + '.prt')
        self.assertEqual(expected_path, prot_db.protfile)
def test_find_gembase_file_basename_file_not_in_gembase(self):
    """
    Test that ``_find_gembase_file_basename`` gets the right basename for
    replicon files which are not located in the gembase and whose file name
    is the output of a split operation:

    * a file containing one contig
    * a file representing a chunk

    Also check that ``GembaseDB`` raises ``FileNotFoundError``, with the
    expected message, when no lst file can be found for the replicon.
    """
    gembase_path = self.find_data('Gembase')
    chunk_path = os.path.join(self.tmp_dir, 'ESCO001.C.00001.C001_chunk_1.fst')
    # expected basename -> replicon file handed to GembaseDB
    file_names = {
        'ACBA.0917.00019': self.find_data(
            os.path.join('Replicons', 'ACBA.0917.00019.0001.fst')),
        'ESCO001.C.00001.C001': chunk_path,
    }
    # The chunk is a gembase replicon copied under a "_chunk_N" file name.
    shutil.copyfile(
        os.path.join(gembase_path, 'Replicons', 'ESCO001.C.00001.C001.fst'),
        chunk_path)

    for base_file_name, replicon_path in file_names.items():
        self.args.replicon = replicon_path
        self.args.gembase_path = gembase_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg, gembase_path=gembase_path)
        # The second argument of assertTrue is the failure message, so the
        # former assertTrue(value, expected) never compared anything.
        # NOTE(review): the chunk key used to carry a trailing '.fst'; the
        # extension-less key assumes the helper returns the same kind of
        # basename as in the one-contig case -- confirm against
        # _find_gembase_file_basename.
        self.assertEqual(
            db._find_gembase_file_basename(gembase_path, replicon_path),
            base_file_name)

    # A replicon for which no lst file exists must be rejected.
    replicon_path = self.find_data(
        os.path.join('Replicons', 'acba.007.p01.13.fst'))
    self.args.replicon = replicon_path
    self.args.gembase_path = gembase_path
    cfg = Config(self.args)
    replicon = next(read_multi_prot_fasta(replicon_path))
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))
    with self.assertRaises(FileNotFoundError) as ctx:
        with self.catch_log():
            GembaseDB(replicon, cfg, gembase_path=gembase_path)
    self.assertEqual(
        str(ctx.exception),
        'cannot find lst file matching {} sequence'.format(replicon_path))
def test_ProteinDB(self):
    """
    Check that a ``ProdigalDB`` built from a replicon exposes that replicon
    (same id) through its ``replicon`` attribute.
    """
    file_name = 'acba.007.p01.13'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    replicon = next(read_multi_prot_fasta(replicon_path))
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))
    db = ProdigalDB(replicon, cfg)
    # assertTrue(a, b) treats ``b`` as the failure message and passes for
    # any truthy ``a``; assertEqual really compares the two ids.
    self.assertEqual(db.replicon.id, replicon.id)
def test_ProteinDB_no_prodigal(self):
    """
    Check that building a ``ProdigalDB`` raises ``RuntimeError`` when
    ``args.prodigal`` is ``None``.
    """
    stem = 'acba.007.p01.13'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    # Cleared after Config creation, as in the original flow; presumably
    # Config reads the value from ``self.args`` lazily -- TODO confirm.
    self.args.prodigal = None
    with self.assertRaises(RuntimeError):
        ProdigalDB(record, config)
def test_ProteinDB(self):
    """
    Check that the ``replicon`` attribute of a freshly built ``ProdigalDB``
    carries the id of the replicon it was built from.
    """
    file_name = 'acba.007.p01.13'
    replicon_path = self.find_data(
        os.path.join('Replicons', file_name + '.fst'))
    self.args.replicon = replicon_path
    cfg = Config(self.args)
    replicon = next(read_multi_prot_fasta(replicon_path))
    replicon.path = replicon_path
    os.makedirs(cfg.tmp_dir(replicon.id))
    db = ProdigalDB(replicon, cfg)
    # The former assertTrue(a, b) only checked that ``a`` is truthy (``b``
    # is the failure message); compare the ids explicitly instead.
    self.assertEqual(db.replicon.id, replicon.id)
def test_ProteinDB_no_prodigal(self):
    """
    ``ProdigalDB`` must raise ``RuntimeError`` when ``args.prodigal`` has
    been set to ``None``.
    """
    stem = 'acba.007.p01.13'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    # The option is removed once the Config exists, as in the original
    # flow; presumably Config looks it up on ``self.args`` when needed
    # -- TODO confirm.
    self.args.prodigal = None
    with self.assertRaises(RuntimeError):
        ProdigalDB(record, config)
def test_ProteinDB(self):
    """
    Check that a ``GembaseDB`` built from a Gembase replicon exposes that
    replicon (same id) through its ``replicon`` attribute.
    """
    # From Gembase Draft, Gembase Complete
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertTrue(a, b) treats ``b`` as the failure message and passes
        # for any truthy ``a``; assertEqual really compares the two ids.
        self.assertEqual(db.replicon.id, replicon.id)
def test_iter(self):
    """
    Check that iterating over a ``ProdigalDB`` yields the same sequence ids,
    in the same order, as iterating over an index of the reference protein
    file.
    """
    stem = 'acba.007.p01.13'
    reference_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    reference_path = self.find_data(os.path.join('Proteins', reference_name))
    reference_idx = SeqIO.index(reference_path, 'fasta',
                                alphabet=Seq.IUPAC.extended_protein)
    # zip() stops at the shorter of the two iterables.
    for expected_id, received_id in zip(reference_idx, prot_db):
        self.assertEqual(expected_id, received_id)
def test_ProteinDB(self):
    """
    Check that the ``replicon`` attribute of a freshly built ``GembaseDB``
    carries the id of the replicon it was built from.
    """
    # From Gembase Draft, Gembase Complete
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # The former assertTrue(a, b) only checked that ``a`` is truthy
        # (``b`` is the failure message); compare the ids explicitly.
        self.assertEqual(db.replicon.id, replicon.id)
def test_get_description(self):
    """
    Check that ``ProdigalDB.get_description`` returns the expected
    ``SeqDesc`` for two known protein ids.
    """
    stem = 'acba.007.p01.13'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    # SeqDesc(id, strand, start, stop)
    expected_descs = {
        'ACBA.007.P01_13_23': SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254),
        'ACBA.007.P01_13_1': SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014),
    }
    for prot_id, expected in expected_descs.items():
        self.assertEqual(expected, prot_db.get_description(prot_id))
def test_iter(self):
    """
    Iterate in lockstep over an index of the reference protein file and over
    a ``ProdigalDB`` and check that both yield the same sequence ids.
    """
    stem = 'acba.007.p01.13'
    reference_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    reference_path = self.find_data(os.path.join('Proteins', reference_name))
    reference_idx = SeqIO.index(reference_path, 'fasta',
                                alphabet=Seq.IUPAC.extended_protein)
    # zip() ends with the shorter of the two iterables.
    for expected_id, received_id in zip(reference_idx, prot_db):
        self.assertEqual(expected_id, received_id)
def test_make_protfile(self):
    """
    For each Gembase replicon, build a ``GembaseDB`` and check that the
    records of ``db.protfile`` carry, in order, the same ids as the records
    of the reference ``.prt`` file.
    """
    cases = (('ACBA.0917.00019', '.fna', 3870),
             ('ESCO001.C.00001.C001', '.fst', 3870))
    for stem, suffix, seq_nb in cases:
        fasta_path = self.find_data(
            os.path.join('Gembase', 'Replicons', stem + suffix))
        self.args.replicon = fasta_path
        config = Config(self.args)
        record = next(read_multi_prot_fasta(fasta_path))
        record.path = fasta_path
        os.makedirs(config.tmp_dir(record.id))
        with self.catch_log():
            prot_db = GembaseDB(record, config)

        reference = read_multi_prot_fasta(
            self.find_data(
                os.path.join('Gembase', 'Proteins', stem + '.prt')))
        produced = read_multi_prot_fasta(prot_db.protfile)
        # NOTE(review): ``seq_nb`` is first the expected number from
        # ``cases`` and is then rebound by the counter below, so the last
        # assertion compares a value with itself: the expected count is
        # never enforced. zip() also stops at the shorter input. Confirm
        # the expected counts before making this a real check.
        for seq_nb, (ref_seq, got_seq) in enumerate(zip(reference, produced), 1):
            self.assertEqual(ref_seq.id, got_seq.id)
        self.assertEqual(seq_nb, seq_nb)
def test_make_protfile(self):
    """
    Check that the protein file made by ``ProdigalDB`` holds, in order, the
    same sequence ids as the reference protein file, and that 23 pairs of
    records were compared.
    """
    stem = 'acba.007.p01.13'
    reference_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    reference = read_multi_prot_fasta(
        self.find_data(os.path.join('Proteins', reference_name)))
    produced = read_multi_prot_fasta(prot_db.protfile)
    # After the loop ``pair_nb`` holds the number of compared pairs.
    for pair_nb, (ref_seq, got_seq) in enumerate(zip(reference, produced), 1):
        self.assertEqual(ref_seq.id, got_seq.id)
    self.assertEqual(pair_nb, 23)
def test_find_gembase_file_basename(self):
    """
    Test that ``_find_gembase_file_basename`` gets the right basename
    (the file name without its extension) for files located in the gembase.
    """
    gembase_path = self.find_data('Gembase')
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # assertTrue(value, expected) only checked that ``value`` is truthy:
        # its second argument is the failure message. Compare for real.
        self.assertEqual(
            db._find_gembase_file_basename(gembase_path, replicon_path),
            os.path.splitext(file_name)[0])
def test_get_description(self):
    """
    Look up two known protein ids with ``ProdigalDB.get_description`` and
    compare the result with the expected ``SeqDesc``.
    """
    stem = 'acba.007.p01.13'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    # SeqDesc(id, strand, start, stop)
    expected_descs = {
        'ACBA.007.P01_13_23': SeqDesc('ACBA.007.P01_13_23', -1, 19721, 20254),
        'ACBA.007.P01_13_1': SeqDesc('ACBA.007.P01_13_1', 1, 55, 1014),
    }
    for prot_id, expected in expected_descs.items():
        self.assertEqual(expected, prot_db.get_description(prot_id))
def test_find_gembase_file_basename(self):
    """
    Test that ``_find_gembase_file_basename`` gets the right basename for
    files located in the gembase, i.e. the replicon file name stripped of
    its extension.
    """
    gembase_path = self.find_data('Gembase')
    file_names = ('ACBA.0917.00019.fna', 'ESCO001.C.00001.C001.fst')
    for file_name in file_names:
        replicon_path = self.find_data(
            os.path.join('Gembase', 'Replicons', file_name))
        self.args.replicon = replicon_path
        cfg = Config(self.args)
        replicon = next(read_multi_prot_fasta(replicon_path))
        replicon.path = replicon_path
        os.makedirs(cfg.tmp_dir(replicon.id))
        with self.catch_log():
            db = GembaseDB(replicon, cfg)
        # The second argument of assertTrue is the failure message, so the
        # former assertTrue(value, expected) never compared the two values.
        self.assertEqual(
            db._find_gembase_file_basename(gembase_path, replicon_path),
            os.path.splitext(file_name)[0])
def test_getitem(self):
    """
    Check that ``ProdigalDB.__getitem__`` returns, for every protein of the
    reference file, a record with the same id and sequence, and that an
    unknown id raises ``KeyError``.
    """
    stem = 'acba.007.p01.13'
    reference_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    reference_path = self.find_data(os.path.join('Proteins', reference_name))
    for wanted in read_multi_prot_fasta(reference_path):
        found = prot_db[wanted.id]
        self.assertEqual(found.id, wanted.id)
        self.assertEqual(found.seq, wanted.seq)

    with self.assertRaises(KeyError) as raised:
        prot_db['nimport_naoik']
    self.assertEqual(str(raised.exception), "'nimport_naoik'")
def test_getitem(self):
    """
    Look up every protein of the reference file in a ``ProdigalDB`` and
    compare id and sequence; an unknown id must raise ``KeyError``.
    """
    stem = 'acba.007.p01.13'
    reference_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    reference_path = self.find_data(os.path.join('Proteins', reference_name))
    for wanted in read_multi_prot_fasta(reference_path):
        found = prot_db[wanted.id]
        self.assertEqual(found.id, wanted.id)
        self.assertEqual(found.seq, wanted.seq)

    with self.assertRaises(KeyError) as raised:
        prot_db['nimport_naoik']
    self.assertEqual(str(raised.exception), "'nimport_naoik'")
def test_make_protfile(self):
    """
    Compare, pairwise and in order, the ids of the reference protein file
    with the ids of the file made by ``ProdigalDB``, then check that 23
    pairs were compared.
    """
    stem = 'acba.007.p01.13'
    reference_name = 'ACBA.007.P01_13.prt'
    fasta_path = self.find_data(os.path.join('Replicons', stem + '.fst'))
    self.args.replicon = fasta_path
    config = Config(self.args)
    record = next(read_multi_prot_fasta(fasta_path))
    record.path = fasta_path
    os.makedirs(config.tmp_dir(record.id))
    prot_db = ProdigalDB(record, config)

    reference = read_multi_prot_fasta(
        self.find_data(os.path.join('Proteins', reference_name)))
    produced = read_multi_prot_fasta(prot_db.protfile)
    # Once the loop is over ``pair_nb`` is the number of compared pairs.
    for pair_nb, (ref_seq, got_seq) in enumerate(zip(reference, produced), 1):
        self.assertEqual(ref_seq.id, got_seq.id)
    self.assertEqual(pair_nb, 23)
def test_get_description(self):
    """
    Compare ``GembaseDB.get_description`` with known descriptions, then
    check that a malformed identifier raises ``IntegronError`` and that an
    unknown identifier raises ``KeyError``.
    """
    # SeqDesc(id, strand, start, stop)
    expected_by_replicon = {
        ('ACBA.0917.00019', '.fna'): {
            'ACBA.0917.00019.b0001_00001':
                SeqDesc('ACBA.0917.00019.b0001_00001', -1, 266, 1480),
            'ACBA.0917.00019.i0001_03957':
                SeqDesc('ACBA.0917.00019.i0001_03957', -1, 4043755, 4044354),
        },
    }
    for (stem, suffix), expected_descs in expected_by_replicon.items():
        fasta_path = self.find_data(
            os.path.join('Gembase', 'Replicons', stem + suffix))
        self.args.replicon = fasta_path
        config = Config(self.args)
        record = next(read_multi_prot_fasta(fasta_path))
        record.path = fasta_path
        os.makedirs(config.tmp_dir(record.id))
        prot_db = GembaseDB(record, config)

        for prot_id, expected in expected_descs.items():
            self.assertEqual(expected, prot_db.get_description(prot_id))

        with self.assertRaises(IntegronError) as raised:
            prot_db.get_description('nimport_naoik')
        self.assertEqual(
            str(raised.exception),
            "'nimport_naoik' is not a valid Gembase protein identifier.")

        with self.assertRaises(KeyError) as raised:
            prot_db.get_description('FOO.BAR.00019.i0001_03924')
        self.assertEqual(str(raised.exception),
                         "'FOO.BAR.00019.i0001_03924'")