def test_topology_genbank(self):
    """Check topology, molecule type and division read from LOCUS lines."""
    # Deliberately low level: only the LOCUS line is fed to the scanner, so
    # no complete GenBank record is needed.
    # Each case is (LOCUS line, topology, molecule_type, data_file_division);
    # None means the annotation is expected to be absent.
    cases = [
        ("LOCUS U00096", None, None, None),
        # Fungal example, accession U49845 from Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN"),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001",
         "linear", "DNA", "BCT"),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008",
         "circular", "DNA", "BCT"),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001",
         "linear", None, "INV"),
    ]
    for locus_line, want_topology, want_mol_type, want_division in cases:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, locus_line)
        annotations = consumer.data.annotations
        # (annotation key, expected value, failure message template)
        checks = (
            ("topology", want_topology,
             "Wrong topology %r not %r from %r"),
            ("molecule_type", want_mol_type,
             "Wrong molecule_type %r not %r from %r"),
            ("data_file_division", want_division,
             "Wrong division %r not %r from %r"),
        )
        for key, expected, template in checks:
            found = annotations.get(key)
            self.assertEqual(
                found, expected, template % (found, expected, locus_line))
def test_topology_genbank(self):
    """Check LOCUS line annotations and the warnings raised while parsing."""
    # Deliberately low level: only the LOCUS line is fed to the scanner, so
    # no complete GenBank record is needed.
    # Each case is (LOCUS line, topology, molecule_type, data_file_division,
    # expected warning categories); None means "absent" for an annotation
    # and "no warnings at all" for the last field.
    cases = [
        ("LOCUS U00096", None, None, None, None),
        # Fungal example, accession U49845 from Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999",
         None, "DNA", "PLN", None),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001",
         "linear", "DNA", "BCT", None),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008",
         "circular", "DNA", "BCT", None),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001",
         "linear", None, "INV", None),
        ("LOCUS pEH010 5743 bp DNA circular",
         "circular", "DNA", None, [BiopythonParserWarning]),
        # Intended as a LOCUS line longer than 80 characters.
        # NOTE(review): the literal below is shorter than that as written
        # here; confirm its internal spacing.
        ("LOCUS AZZZAA02123456789 1000000000 bp DNA linear PRI 15-OCT-2018",
         "linear", "DNA", "PRI", None),
    ]
    for (locus_line, want_topology, want_mol_type, want_division,
         want_warnings) in cases:
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, even ones already shown once.
            warnings.simplefilter("always")
            scanner = Scanner.GenBankScanner()
            consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
            scanner._feed_first_line(consumer, locus_line)
            annotations = consumer.data.annotations
            # (annotation key, expected value, failure message template)
            checks = (
                ("topology", want_topology,
                 "Wrong topology %r not %r from %r"),
                ("molecule_type", want_mol_type,
                 "Wrong molecule_type %r not %r from %r"),
                ("data_file_division", want_division,
                 "Wrong division %r not %r from %r"),
            )
            for key, expected, template in checks:
                found = annotations.get(key)
                self.assertEqual(
                    found, expected, template % (found, expected, locus_line))
            # None is treated as "expect no warnings", i.e. an empty list.
            expected_categories = [] if want_warnings is None else want_warnings
            self.assertEqual(len(caught), len(expected_categories))
            for position, category in enumerate(expected_categories):
                self.assertEqual(caught[position].category, category)
class GenBankScannerTests(unittest.TestCase):
    """GenBank Scanner tests, covering parsing of gbk and embl files."""

    # Scanner instance shared by the GenBank helper methods below.
    gb_s = Scanner.GenBankScanner()

    def gb_to_l_cds_f(self, filename, tags2id=None):
        """Return the CDS features of a GenBank file as a list.

        ``tags2id`` is forwarded to the scanner only when it is truthy;
        otherwise ``parse_cds_features`` is called without it.
        """
        extra = {"tags2id": tags2id} if tags2id else {}
        with open(filename) as handle:
            return list(self.gb_s.parse_cds_features(handle, **extra))

    def gb_to_l_r(self, filename, do_features=False):
        """Return the records of a GenBank file as a list."""
        with open(filename) as handle:
            return list(
                self.gb_s.parse_records(handle, do_features=do_features))

    def test_genbank_cds_interaction(self):
        """Test CDS interaction, parse CDS features on gb(k) files."""
        tags = ("gene", "locus_tag", "product")

        # NC_000932.gb without tags2id: 85 CDS features expected.
        cds_plain = self.gb_to_l_cds_f("GenBank/NC_000932.gb")
        self.assertEqual(len(cds_plain), 85)
        self.assertEqual(cds_plain[0].id, "NP_051037.1")
        self.assertEqual(cds_plain[84].id, "NP_051123.1")

        # NC_005816.gb with explicit tags2id: 10 CDS features expected.
        cds_tagged = self.gb_to_l_cds_f("GenBank/NC_005816.gb", tags2id=tags)
        self.assertEqual(len(cds_tagged), 10)
        self.assertEqual(cds_tagged[0].id, "<unknown id>")
        self.assertEqual(cds_tagged[0].name, "YP_pPCP01")

        # Both files parsed with the same tags2id and joined: 85 + 10 = 95.
        from_first = self.gb_to_l_cds_f("GenBank/NC_000932.gb", tags2id=tags)
        from_second = self.gb_to_l_cds_f("GenBank/NC_005816.gb", tags2id=tags)
        combined = from_first + from_second
        self.assertEqual(len(combined), 95)
        self.assertEqual(combined[0].id, "rps12")
        self.assertEqual(combined[0].description, "ribosomal protein S12")
        self.assertEqual(combined[94].id, "<unknown id>")
        self.assertEqual(combined[94].description, "hypothetical protein")

    def test_genbank_interaction(self):
        """Test GenBank records interaction on gbk files."""
        yersinia = ("Yersinia pestis biovar Microtus str. 91001 plasmid "
                    "pPCP1, complete sequence")
        arabidopsis = "Arabidopsis thaliana chloroplast, complete genome"
        # (filename, do_features, id, name, description, feature count)
        cases = [
            ("GenBank/NC_005816.gb", False,
             "NC_005816.1", "NC_005816", yersinia, 0),
            ("GenBank/NC_005816.gb", True,
             "NC_005816.1", "NC_005816", yersinia, 41),
            ("GenBank/NC_000932.gb", False,
             "NC_000932.1", "NC_000932", arabidopsis, 0),
            ("GenBank/NC_000932.gb", True,
             "NC_000932.1", "NC_000932", arabidopsis, 259),
        ]
        for filename, do_features, rec_id, name, description, n_feat in cases:
            records = self.gb_to_l_r(filename, do_features=do_features)
            # Each file is expected to hold exactly one record.
            self.assertEqual(len(records), 1)
            record = records[0]
            self.assertEqual(record.id, rec_id)
            self.assertEqual(record.name, name)
            self.assertEqual(record.description, description)
            self.assertEqual(len(record.features), n_feat)

    def test_embl_cds_interaction(self):
        """Test EMBL CDS interaction, parse CDS features on embl files."""
        scanner = Scanner.EmblScanner()
        with open("EMBL/AE017046.embl") as handle:
            cds_features = list(scanner.parse_cds_features(handle))
        # 10 CDS features expected in this file.
        self.assertEqual(len(cds_features), 10)
        first = cds_features[0]
        self.assertEqual(first.id, "AAS58758.1")
        self.assertEqual(first.description, "putative transposase")

    def test_embl_record_interaction(self):
        """Test EMBL Record interaction on embl files."""
        scanner = Scanner.EmblScanner()
        with open("EMBL/AE017046.embl") as handle:
            records = list(scanner.parse_records(handle, do_features=True))
        # A single record is expected in this file.
        self.assertEqual(len(records), 1)
        record = records[0]
        self.assertEqual(record.id, "AE017046.1")
        self.assertEqual(
            record.description,
            "Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, "
            "complete sequence.")
        self.assertEqual(len(record.features), 29)