def test_topology_genbank(self):
    """Feed single GenBank LOCUS lines to the scanner and check the annotations.

    Each case pairs a LOCUS line with the expected ``topology``,
    ``molecule_type`` and ``data_file_division`` annotation values
    (``None`` meaning the annotation should be absent).
    """
    # Deliberately low level: only the LOCUS line is parsed, not a whole record.
    cases = [
        ("LOCUS U00096", None, None, None),
        # This example is actually fungal, accession U49845 from
        # Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999", None, "DNA", "PLN"),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001",
         "linear", "DNA", "BCT"),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008",
         "circular", "DNA", "BCT"),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001",
         "linear", None, "INV"),
    ]
    for line, topo, mol_type, div in cases:
        scanner = Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        annotations = consumer.data.annotations

        found_topology = annotations.get("topology")
        self.assertEqual(
            found_topology, topo,
            "Wrong topology %r not %r from %r" % (found_topology, topo, line))

        found_mol_type = annotations.get("molecule_type")
        self.assertEqual(
            found_mol_type, mol_type,
            "Wrong molecule_type %r not %r from %r"
            % (found_mol_type, mol_type, line))

        found_division = annotations.get("data_file_division")
        self.assertEqual(
            found_division, div,
            "Wrong division %r not %r from %r" % (found_division, div, line))
def test_embl_cds_interaction(self):
    """Parse the CDS features of an EMBL file and check the first record."""
    scanner = Scanner.EmblScanner()
    # Materialise the records while the file is still open.
    with open("EMBL/AE017046.embl") as handle:
        cds_records = list(scanner.parse_cds_features(handle))

    # The file is expected to yield ten CDS records.
    self.assertEqual(len(cds_records), 10)

    # Identifier and description of the first CDS.
    first = cds_records[0]
    self.assertEqual(first.id, "AAS58758.1")
    self.assertEqual(first.description, "putative transposase")
def test_embl_record_interaction(self):
    """Parse whole records (with features) from an EMBL file and check them."""
    scanner = Scanner.EmblScanner()
    # Materialise the records while the file is still open.
    with open("EMBL/AE017046.embl") as handle:
        records = list(scanner.parse_records(handle, do_features=True))

    # Exactly one record is expected in this file.
    self.assertEqual(len(records), 1)

    record = records[0]
    expected_description = ("Yersinia pestis biovar Microtus "
                            "str. 91001 plasmid pPCP1, complete "
                            "sequence.")
    self.assertEqual(record.id, "AE017046.1")
    self.assertEqual(record.description, expected_description)
    self.assertEqual(len(record.features), 29)
def test_topology_genbank(self):
    """Feed single GenBank LOCUS lines to the scanner and check the outcome.

    Each case holds a LOCUS line, the expected ``topology``,
    ``molecule_type`` and ``data_file_division`` annotations (``None``
    meaning absent), and the warning categories the line should trigger
    (``None`` meaning no warnings at all).
    """
    # Deliberately low level: only the LOCUS line is parsed, not a whole record.
    cases = [
        ("LOCUS U00096", None, None, None, None),
        # This example is actually fungal, accession U49845 from
        # Saccharomyces cerevisiae:
        ("LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999",
         None, "DNA", "PLN", None),
        ("LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001",
         "linear", "DNA", "BCT", None),
        ("LOCUS NC_005816 9609 bp DNA circular BCT 21-JUL-2008",
         "circular", "DNA", "BCT", None),
        ("LOCUS SCX3_BUTOC 64 aa linear INV 16-OCT-2001",
         "linear", None, "INV", None),
        ("LOCUS pEH010 5743 bp DNA circular",
         "circular", "DNA", None, [BiopythonParserWarning]),
        # This is a test of the format > 80 chars long
        ("LOCUS AZZZAA02123456789 1000000000 bp DNA linear PRI 15-OCT-2018",
         "linear", "DNA", "PRI", None),
    ]
    for line, topo, mol_type, div, warning_list in cases:
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, even ones already seen in earlier cases.
            warnings.simplefilter("always")
            scanner = Scanner.GenBankScanner()
            consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
            scanner._feed_first_line(consumer, line)
            annotations = consumer.data.annotations

            # (annotation key, expected value, failure message template)
            checks = (
                ("topology", topo,
                 "Wrong topology %r not %r from %r"),
                ("molecule_type", mol_type,
                 "Wrong molecule_type %r not %r from %r"),
                ("data_file_division", div,
                 "Wrong division %r not %r from %r"),
            )
            for key, expected, template in checks:
                found = annotations.get(key)
                self.assertEqual(found, expected,
                                 template % (found, expected, line))

            # ``None`` is treated as "no warnings expected".
            expected_categories = warning_list or []
            self.assertEqual(len(caught), len(expected_categories))
            for seen, category in zip(caught, expected_categories):
                self.assertEqual(seen.category, category)
def test_topology_embl(self):
    """Feed single EMBL ID lines to the scanner and check the annotations.

    Each case pairs an ID line with the expected ``topology``,
    ``molecule_type`` and ``data_file_division`` annotation values
    (``None`` meaning the annotation should be absent).
    """
    # Deliberately low level: only the ID line is parsed, not a whole record.
    cases = [
        # Modern examples with sequence version
        ("ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.",
         "linear", "mRNA", "PLN"),
        ("ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.",
         "linear", "genomic DNA", "MAM"),
        # Example to match GenBank example used above:
        ("ID U49845; SV 1; linear; genomic DNA; STD; FUN; 5028 BP.",
         "linear", "genomic DNA", "FUN"),
        # Old examples:
        ("ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.",
         "circular", "DNA", "PRO"),
        ("ID SC10H5 standard; DNA; PRO; 4870 BP.",
         None, "DNA", "PRO"),
        # Patent example from 2016-06-10
        # ftp://ftp.ebi.ac.uk/pub/databases/embl/patent/
        ("ID A01679; SV 1; linear; unassigned DNA; PAT; MUS; 12 BP.",
         "linear", "unassigned DNA", "MUS"),
        # Old patent examples
        ("ID NRP_AX000635; PRT; NR1; 15 SQ", None, None, "NR1"),
        ("ID NRP0000016E; PRT; NR2; 5 SQ", None, None, "NR2"),
        # KIPO patent examples
        ("ID DI500001 STANDARD; PRT; 111 AA.", None, None, None),
        ("ID DI644510 standard; PRT; 1852 AA.", None, None, None),
    ]
    for line, topo, mol_type, div in cases:
        scanner = Scanner.EmblScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        annotations = consumer.data.annotations

        # (annotation key, expected value, failure message template)
        checks = (
            ("topology", topo,
             "Wrong topology %r not %r from %r"),
            ("molecule_type", mol_type,
             "Wrong molecule_type %r not %r from %r"),
            ("data_file_division", div,
             "Wrong division %r not %r from %r"),
        )
        for key, expected, template in checks:
            found = annotations.get(key)
            self.assertEqual(found, expected,
                             template % (found, expected, line))
def test_first_line_imgt(self):
    """Feed single IMGT ID lines to the scanner and check the annotations.

    Each case pairs an ID line with the expected ``topology``,
    ``molecule_type`` and ``data_file_division`` annotation values
    (``None`` meaning the annotation should be absent).
    """
    # Deliberately low level: only the ID line is parsed, not a whole record.
    cases = [
        ("ID HLA00001 standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
        ("ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.",
         None, "DNA", "HUM"),
    ]
    for line, topo, mol_type, div in cases:
        scanner = Scanner._ImgtScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        annotations = consumer.data.annotations

        # (annotation key, expected value, failure message template)
        checks = (
            ("topology", topo,
             "Wrong topology %r not %r from %r"),
            ("molecule_type", mol_type,
             "Wrong molecule_type %r not %r from %r"),
            ("data_file_division", div,
             "Wrong division %r not %r from %r"),
        )
        for key, expected, template in checks:
            found = annotations.get(key)
            self.assertEqual(found, expected,
                             template % (found, expected, line))
def test_first_line_imgt(self):
    """Check the annotations produced from IMGT ID lines alone.

    Every case is ``(ID line, topology, molecule_type, division)``; a
    ``None`` expectation means the annotation should not be set.
    """
    # Low-level test: the scanner only ever sees the ID line.
    id_line_cases = [
        ("ID HLA00001 standard; DNA; HUM; 3503 BP.", None, "DNA", "HUM"),
        ("ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.",
         None, "DNA", "HUM"),
    ]
    for line, topo, mol_type, div in id_line_cases:
        scanner = Scanner._ImgtScanner()
        consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
        scanner._feed_first_line(consumer, line)
        parsed = consumer.data.annotations

        got_topology = parsed.get("topology")
        self.assertEqual(
            got_topology, topo,
            "Wrong topology %r not %r from %r" % (got_topology, topo, line))

        got_mol_type = parsed.get("molecule_type")
        self.assertEqual(
            got_mol_type, mol_type,
            "Wrong molecule_type %r not %r from %r"
            % (got_mol_type, mol_type, line))

        got_division = parsed.get("data_file_division")
        self.assertEqual(
            got_division, div,
            "Wrong division %r not %r from %r" % (got_division, div, line))
class GenBankScannerTests(unittest.TestCase):
    """Exercise the GenBank and EMBL scanners on example gb(k)/embl files."""

    # One GenBank scanner shared by the helper methods below.
    gb_s = Scanner.GenBankScanner()

    def gb_to_l_cds_f(self, filename, tags2id=None):
        """Return the CDS features of a GenBank file as a list of records.

        ``tags2id`` is forwarded to the scanner only when it is truthy;
        otherwise the scanner's own default is used.
        """
        extra = {"tags2id": tags2id} if tags2id else {}
        with open(filename) as handle:
            return list(self.gb_s.parse_cds_features(handle, **extra))

    def gb_to_l_r(self, filename, do_features=False):
        """Return the records of a GenBank file as a list."""
        with open(filename) as handle:
            return list(self.gb_s.parse_records(handle,
                                                do_features=do_features))

    def test_genbank_cds_interaction(self):
        """Parse CDS features from gb(k) files, with and without tags2id."""
        id_tags = ("gene", "locus_tag", "product")

        # NC_000932.gb with the default tags: 85 CDS records expected.
        arabidopsis_cds = self.gb_to_l_cds_f("GenBank/NC_000932.gb")
        self.assertEqual(len(arabidopsis_cds), 85)
        self.assertEqual(arabidopsis_cds[0].id, "NP_051037.1")
        self.assertEqual(arabidopsis_cds[84].id, "NP_051123.1")

        # NC_005816.gb with custom tags: 10 CDS records expected.
        yersinia_cds = self.gb_to_l_cds_f("GenBank/NC_005816.gb",
                                          tags2id=id_tags)
        self.assertEqual(len(yersinia_cds), 10)
        self.assertEqual(yersinia_cds[0].id, "<unknown id>")
        self.assertEqual(yersinia_cds[0].name, "YP_pPCP01")

        # Both files parsed with custom tags and concatenated: 95 records.
        combined = (
            self.gb_to_l_cds_f("GenBank/NC_000932.gb", tags2id=id_tags)
            + self.gb_to_l_cds_f("GenBank/NC_005816.gb", tags2id=id_tags)
        )
        self.assertEqual(len(combined), 95)
        first, last = combined[0], combined[94]
        self.assertEqual(first.id, "rps12")
        self.assertEqual(first.description, "ribosomal protein S12")
        self.assertEqual(last.id, "<unknown id>")
        self.assertEqual(last.description, "hypothetical protein")

    def test_genbank_interaction(self):
        """Parse whole records from gbk files, with and without features."""
        yersinia_description = ("Yersinia pestis biovar "
                                "Microtus str. 91001 plasmid "
                                "pPCP1, complete sequence")
        arabidopsis_description = ("Arabidopsis thaliana chloroplast, "
                                   "complete genome")
        # (file, do_features, id, name, description, number of features)
        cases = [
            ("GenBank/NC_005816.gb", False,
             "NC_005816.1", "NC_005816", yersinia_description, 0),
            ("GenBank/NC_005816.gb", True,
             "NC_005816.1", "NC_005816", yersinia_description, 41),
            ("GenBank/NC_000932.gb", False,
             "NC_000932.1", "NC_000932", arabidopsis_description, 0),
            ("GenBank/NC_000932.gb", True,
             "NC_000932.1", "NC_000932", arabidopsis_description, 259),
        ]
        for filename, with_features, rec_id, name, description, count in cases:
            records = self.gb_to_l_r(filename, do_features=with_features)
            # Each file is expected to hold exactly one record.
            self.assertEqual(len(records), 1)
            record = records[0]
            self.assertEqual(record.id, rec_id)
            self.assertEqual(record.name, name)
            self.assertEqual(record.description, description)
            self.assertEqual(len(record.features), count)

    def test_embl_cds_interaction(self):
        """Parse the CDS features of an EMBL file and check the first record."""
        scanner = Scanner.EmblScanner()
        with open("EMBL/AE017046.embl") as handle:
            cds_records = list(scanner.parse_cds_features(handle))

        # The file is expected to yield ten CDS records.
        self.assertEqual(len(cds_records), 10)
        first = cds_records[0]
        self.assertEqual(first.id, "AAS58758.1")
        self.assertEqual(first.description, "putative transposase")

    def test_embl_record_interaction(self):
        """Parse whole records (with features) from an EMBL file."""
        scanner = Scanner.EmblScanner()
        with open("EMBL/AE017046.embl") as handle:
            records = list(scanner.parse_records(handle, do_features=True))

        # Exactly one record is expected in this file.
        self.assertEqual(len(records), 1)
        record = records[0]
        self.assertEqual(record.id, "AE017046.1")
        self.assertEqual(record.description,
                         "Yersinia pestis biovar Microtus "
                         "str. 91001 plasmid pPCP1, complete "
                         "sequence.")
        self.assertEqual(len(record.features), 29)
def tab_parser(handle, quiet=False):
    """Parse a bare "FT"-style feature table into a record.

    Only the feature table is read: no record start, header or footer is
    parsed.  A placeholder sequence of ``"N"`` characters is attached,
    long enough to cover the furthest feature end.

    Arguments:
     - handle - file-like object positioned at the first feature line.
     - quiet - currently unused; kept for interface compatibility.

    Returns the ``data`` attribute of the ``_FeatureConsumer`` that
    received the parsed features.

    Raises ValueError if a ``//`` marker is met inside the table or a
    line lacks a feature key.
    """
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    def Si_parse_tab_features(scanner, skip=False):
        """Return a list of parsed features read from the scanner's handle.

        Each feature is whatever ``scanner.parse_feature`` returns for the
        feature key and its lines; with ``skip`` true the features are
        read past and an empty list is returned.

        Assumes ``scanner.line`` already holds the first feature line.
        """
        indent = scanner.FEATURE_QUALIFIER_INDENT
        spacer = scanner.FEATURE_QUALIFIER_SPACER
        features = []
        line = scanner.line
        while True:
            if not line:
                # End of file simply ends the table (there is no terminator).
                break
            if line[:scanner.HEADER_WIDTH].rstrip() in scanner.SEQUENCE_HEADERS:
                if scanner.debug:
                    print("Found start of sequence")
                break
            line = line.rstrip()
            if line == "//":
                raise ValueError(
                    "Premature end of features table, marker '//' found")
            if line in scanner.FEATURE_END_MARKERS:
                if scanner.debug:
                    print("Found end of features")
                line = scanner.handle.readline()
                break
            if line[2:indent].strip() == "":
                raise ValueError(
                    "Expected a feature qualifier in line '%s'" % line)
            if line.split()[0] in ["ID", "source"]:
                # Lines whose first token is "ID" or "source" are ignored.
                line = scanner.handle.readline()
                continue

            if skip:
                # Read past this feature and all of its continuation lines.
                line = scanner.handle.readline()
                while line[:indent] == spacer:
                    line = scanner.handle.readline()
            else:
                # Build up a list of the lines making up this feature:
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
                line = scanner.handle.readline()
                # Continuation lines start with the spacer; blank lines in
                # the midst of a feature are tolerated too.
                while line and (line[:indent] == spacer
                                or line.rstrip() == ""):
                    feature_lines.append(line[indent:].rstrip())
                    line = scanner.handle.readline()
                # Every feature gets a dummy /seq qualifier appended.
                feature_lines.append('/seq="N"')
                sys.stdout.flush()
                features.append(
                    scanner.parse_feature(feature_key, feature_lines))
        scanner.line = line
        return features

    def Si_feed(scanner, handle, consumer, do_features=True):
        """Feed the feature table and a placeholder sequence to the consumer.

        Arguments:
         - scanner - scanner whose handle and current line are already set.
         - handle - unused here; the scanner's own handle is read.
         - consumer - the consumer that should be informed of events.
         - do_features - boolean, should the features be parsed?

        Always returns True.
        """
        if do_features:
            scanner._feed_feature_table(
                consumer, Si_parse_tab_features(scanner, skip=False))
        else:
            Si_parse_tab_features(scanner, skip=True)  # ignore the data

        # No real sequence is available, so announce a one-base placeholder.
        consumer.sequence("N")
        consumer.record_end("//")

        # Replace the sequence with enough N's to span every feature.
        length = 0
        for feature in consumer.data.features:
            length = max(length, feature.location.nofuzzy_end)
        consumer.data.seq = "N" * length
        return True

    myscanner = Scanner.InsdcScanner()
    myscanner.set_handle(handle)
    myscanner.line = myscanner.handle.readline()
    # Feature keys occupy columns 2-20; qualifiers start at column 21.
    myscanner.FEATURE_QUALIFIER_INDENT = 21
    myscanner.FEATURE_QUALIFIER_SPACER = "FT" + " " * (
        myscanner.FEATURE_QUALIFIER_INDENT - 2)
    # NOTE(review): debug output is always on; ``quiet`` is not consulted.
    myscanner.debug = True
    consumer = _FeatureConsumer(use_fuzziness=1,
                                feature_cleaner=FeatureValueCleaner())
    Si_feed(myscanner, handle, consumer)
    return consumer.data
def tab_parser(handle, quiet=False):
    """Parse a bare "FT"-style feature table into a record.

    Only the feature table is read: no record start, header or footer is
    parsed.  A placeholder sequence of ``"N"`` characters is attached,
    long enough to cover the furthest feature end.

    Arguments:
     - handle - file-like object positioned at the first feature line.
     - quiet - currently unused; kept for interface compatibility.

    Returns the ``data`` attribute of the ``_FeatureConsumer`` that
    received the parsed features.

    Raises ValueError if a ``//`` marker is met inside the table or a
    line lacks a feature key.
    """

    def Drawer_parse_tab_features(scanner, skip=False):
        """Return a list of parsed features read from the scanner's handle.

        With ``skip`` true the features are read past and an empty list is
        returned.  Assumes ``scanner.line`` holds the first feature line.
        """
        indent = scanner.FEATURE_QUALIFIER_INDENT
        spacer = scanner.FEATURE_QUALIFIER_SPACER
        features = []
        line = scanner.line
        while True:
            if not line:
                # End of file simply ends the table (there is no terminator).
                break
            if line[:scanner.HEADER_WIDTH].rstrip() in scanner.SEQUENCE_HEADERS:
                if scanner.debug:
                    print("Found start of sequence")
                break
            line = line.rstrip()
            if line == "//":
                raise ValueError(
                    "Premature end of features table, marker '//' found")
            if line in scanner.FEATURE_END_MARKERS:
                if scanner.debug:
                    print("Found end of features")
                line = scanner.handle.readline()
                break
            if line[2:indent].strip() == "":
                raise ValueError(
                    "Expected a feature qualifier in line '%s'" % line)

            if skip:
                # Read past this feature and all of its continuation lines.
                line = scanner.handle.readline()
                while line[:indent] == spacer:
                    line = scanner.handle.readline()
            else:
                # Build up a list of the lines making up this feature:
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
                line = scanner.handle.readline()
                # Continuation lines start with the spacer; blank lines in
                # the midst of a feature are tolerated too.
                while line[:indent] == spacer or line.rstrip() == "":
                    feature_lines.append(line[indent:].rstrip())
                    line = scanner.handle.readline()
                    if not line:
                        break  # EOF
                # Every feature gets a dummy /seq qualifier appended.
                feature_lines.append('/seq="N"')
                sys.stdout.flush()
                features.append(
                    scanner.parse_feature(feature_key, feature_lines))
        scanner.line = line
        return features

    def Drawer_feed(scanner, handle, consumer, do_features=True):
        """Feed the feature table and a placeholder sequence to the consumer.

        ``handle`` is unused here (the scanner's own handle is read).
        Always returns True.
        """
        if do_features:
            scanner._feed_feature_table(
                consumer, Drawer_parse_tab_features(scanner, skip=False))
        else:
            Drawer_parse_tab_features(scanner, skip=True)  # ignore the data

        # No real sequence is available, so announce a one-base placeholder.
        consumer.sequence("N")
        consumer.record_end("//")

        # Replace the sequence with enough N's to span every feature.
        length = 0
        for feature in consumer.data.features:
            length = max(length, feature.location.nofuzzy_end)
        consumer.data.seq = "N" * length
        return True

    myscanner = Scanner.InsdcScanner()
    myscanner.set_handle(handle)
    myscanner.line = myscanner.handle.readline()
    # Feature keys occupy columns 2-20; qualifiers start at column 21.
    myscanner.FEATURE_QUALIFIER_INDENT = 21
    myscanner.FEATURE_QUALIFIER_SPACER = "FT" + " " * (
        myscanner.FEATURE_QUALIFIER_INDENT - 2)
    # NOTE(review): debug output is always on; ``quiet`` is not consulted.
    myscanner.debug = True
    consumer = _FeatureConsumer(use_fuzziness=1,
                                feature_cleaner=FeatureValueCleaner())
    Drawer_feed(myscanner, handle, consumer)
    return consumer.data