def test_records_begin(self):
    """Compare the first two records of ``self.idx._get_record(34)`` to fixtures."""
    expected = [
        SeqRecord(
            Seq(
                "TCATAGGTATTTATTTTTAAATATGGTTTGCTTTATGGCTAGAA"
                "CACACCGATTACTTAAAATAGGATTAACC--CCCATACACTTTA"
                "AAAATGATTAAACAACATTTCTGCTGCTCGCTCACATTCTTCAT"
                "AGAAGATGACATAATGTATTTTCCTTTTGGTT"
            ),
            id="mm9.chr10",
            name="mm9.chr10",
            description="",
            annotations={
                "start": 3009319,
                "srcSize": 129993255,
                "strand": 1,
                "size": 162,
            },
        ),
        SeqRecord(
            Seq(
                "TCACAGATATTTACTATTAAATATGGTTTGTTATATGGTTACGG"
                "TTCATAGGTTACTTGGAATTGGATTAACCTTCTTATTCATTGCA"
                "GAATTGGTTACACTGTGTTCTTGACCTTTGCTTGTTTTCTCCAT"
                "GGAAACTGATGTCAAATACTTTCCCTTTGGTT"
            ),
            id="oryCun1.scaffold_133159",
            name="oryCun1.scaffold_133159",
            description="",
            annotations={
                "start": 11087,
                "srcSize": 13221,
                "strand": 1,
                "size": 164,
            },
        ),
    ]

    fetched = self.idx._get_record(34)

    # Index into the fetched records (rather than zip) so that a short
    # result still fails loudly instead of being silently truncated.
    for position, wanted in enumerate(expected):
        self.assertTrue(compare_record(wanted, fetched[position]))
def loop(self, filename, format):
    """Round-trip the records of *filename* through BioSQL and GenBank output.

    The records are parsed from *filename* using *format*, loaded into a
    freshly created BioSQL namespace, looked up again by name, written to an
    in-memory handle as "gb", parsed back, and compared with the originals
    at each stage.

    Args:
        filename: path of the sequence file to load.
        format: SeqIO format name used to parse *filename*.
    """
    # Use a context manager so the input handle is closed even if parsing
    # fails (the original passed an anonymous open() and never closed it).
    # The "rU" mode is kept exactly as it was.
    with open(filename, "rU") as in_handle:
        original_records = list(SeqIO.parse(in_handle, format))

    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER, passwd=DBPASSWD,
                                          host=DBHOST, db=TESTDB)
    # try/finally so a failing assertion no longer leaves the server
    # connection open.
    try:
        db_name = "test_loop_%s" % filename  # new namespace!
        db = server.new_database(db_name)
        count = db.load(original_records)
        self.assertEqual(count, len(original_records))
        server.commit()

        # Now read them back...
        biosql_records = [db.lookup(name=rec.name)
                          for rec in original_records]
        # And check they agree
        self.assertTrue(compare_records(original_records, biosql_records))

        # Now write to a handle...
        handle = StringIO()
        SeqIO.write(biosql_records, handle, "gb")
        # Now read them back...
        handle.seek(0)
        new_records = list(SeqIO.parse(handle, "gb"))

        # And check they still agree
        self.assertEqual(len(new_records), len(original_records))
        for old, new in zip(original_records, new_records):
            # TODO - remove this hack because we don't yet write these (yet):
            # Drop annotations that are missing after the round trip so
            # they do not make the comparison below fail.
            for key in ["comment", "references", "db_source"]:
                if key in old.annotations and key not in new.annotations:
                    del old.annotations[key]
            self.assertTrue(compare_record(old, new))
    finally:
        # Done
        server.close()
def get_raw_check(self, filename, format, alphabet):
    """Check ``get_raw`` of a ``SeqIO.index`` against the file's text.

    For each record id (lower-cased through the ``key_function``) the raw
    text returned by ``get_raw`` must be non-blank and occur in the file,
    and, except for the "ig" format, must parse back into a record that
    compares equal to the one fetched from the index.

    Args:
        filename: path of the sequence file to index.
        format: SeqIO format name; binary formats return immediately.
        alphabet: alphabet forwarded to the SeqIO calls.
    """
    if format in SeqIO._BinaryFormats:
        # This means SFF at the moment, which does not
        # implement the get_raw method
        return

    # Context manager: the handle is closed even if read() raises.
    with open(filename, "rU") as handle:
        raw_file = handle.read()

    # Also checking the key_function here
    id_list = [rec.id.lower()
               for rec in SeqIO.parse(filename, format, alphabet)]
    rec_dict = SeqIO.index(filename, format, alphabet,
                           key_function=lambda x: x.lower())
    self.assertEqual(set(id_list), set(rec_dict.keys()))
    self.assertEqual(len(id_list), len(rec_dict))
    for key in id_list:
        # assertTrue replaces the deprecated assert_ alias, matching the
        # other get_raw_check variants in this file.
        self.assertTrue(key in rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(raw.strip())
        self.assertTrue(raw in raw_file)
        if format in ["ig"]:
            # These have a header structure and can't be parsed
            # individually (at least, not right now).
            continue
        rec1 = rec_dict[key]
        rec2 = SeqIO.read(StringIO(raw), format, alphabet)
        self.assertEqual(True, compare_record(rec1, rec2))
def get_raw_check(self, filename, format, alphabet):
    """Verify ``get_raw`` of a ``SeqIO.index`` against the file's raw bytes.

    Each raw record must be non-blank, occur in the file contents, and
    re-parse into a record equal to the one the index returns for that key.
    """
    def lower_key(identifier):
        # Also checking the key_function here
        return identifier.lower()

    source = open(filename, "rb")
    raw_file = source.read()
    source.close()

    id_list = [rec.id.lower()
               for rec in SeqIO.parse(filename, format, alphabet)]
    rec_dict = SeqIO.index(filename, format, alphabet,
                           key_function=lower_key)
    self.assertEqual(set(id_list), set(rec_dict.keys()))
    self.assertEqual(len(id_list), len(rec_dict))

    for key in id_list:
        self.assertTrue(key in rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(raw.strip())
        self.assertTrue(raw in raw_file)
        rec1 = rec_dict[key]

        # Following isn't very elegant, but it lets me test the
        # __getitem__ SFF code is working.
        if format in SeqIO._BinaryFormats:
            record_handle = BytesIO(raw)
        else:
            record_handle = StringIO(_bytes_to_string(raw))

        if format in ("sff", "sff-trim"):
            # Both SFF flavours share the reader; only trimming differs.
            proxy = rec_dict._proxy
            rec2 = SeqIO.SffIO._sff_read_seq_record(
                record_handle,
                proxy._flows_per_read,
                proxy._flow_chars,
                proxy._key_sequence,
                proxy._alphabet,
                trim=(format == "sff-trim"))
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(_as_bytes("<entry ")))
            self.assertTrue(raw.endswith(_as_bytes("</entry>")))
            # Currently the __getitem__ method uses this
            # trick too, but we hope to fix that later
            wrapped = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % _bytes_to_string(raw)
            rec2 = SeqIO.read(StringIO(wrapped), format, alphabet)
        else:
            rec2 = SeqIO.read(record_handle, format, alphabet)

        self.assertEqual(True, compare_record(rec1, rec2))

    rec_dict._proxy._handle.close()  # TODO - Better solution
    del rec_dict
def test_multi_ex_index(self):
    """Index SwissProt text and uniprot XML versions of several examples.

    The ids listed in ``SwissProt/multi_ex.list`` must match the keys of
    both indexes, and every record obtained via ``SeqIO.parse`` is passed
    to ``compare_record`` together with the record fetched from the index.
    """
    txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
    xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))

    # Read the id list through a context manager so the file is closed
    # (the original iterated an anonymous open() and leaked the handle).
    with open("SwissProt/multi_ex.list") as id_handle:
        ids = [x.strip() for x in id_handle]

    txt_index = SeqIO.index("SwissProt/multi_ex.txt", "swiss")
    xml_index = SeqIO.index("SwissProt/multi_ex.xml", "uniprot-xml")
    self.assertEqual(sorted(txt_index), sorted(ids))
    self.assertEqual(sorted(xml_index), sorted(ids))

    # Check SeqIO.parse() versus SeqIO.index() for plain text "swiss"
    for old in txt_list:
        new = txt_index[old.id]
        compare_record(old, new)

    # Check SeqIO.parse() versus SeqIO.index() for XML "uniprot-xml"
    for old in xml_list:
        new = xml_index[old.id]
        compare_record(old, new)
def check(self, t_format, t_filename, t_count=1):
    """Load *t_filename* into ``self.db`` and look every record up again.

    Each record is retrieved by name, by display_id, by version (when the
    id looks like ``name.digits``), by its first accession and by GI, and
    each retrieved record is passed to ``compare_record``.

    Args:
        t_format: SeqIO format name of the input file.
        t_filename: path of the sequence file to load.
        t_count: number of records ``db.load`` is expected to report.
    """
    db = self.db

    # Both passes over the file use a context manager so the handles are
    # closed (the original passed anonymous open() calls and leaked them).
    with open(t_filename, "r") as handle:
        count = db.load(SeqIO.parse(handle=handle, format=t_format))
    assert count == t_count
    self.server.commit()

    with open(t_filename, "r") as handle:
        for record in SeqIO.parse(handle=handle, format=t_format):
            # Retrieving by name/display_id
            key = record.name
            db_rec = db.lookup(name=key)
            compare_record(record, db_rec)
            db_rec = db.lookup(display_id=key)
            compare_record(record, db_rec)

            key = record.id
            if key.count(".") == 1 and key.split(".")[1].isdigit():
                # Retrieving by version
                db_rec = db.lookup(version=key)
                compare_record(record, db_rec)

            if "accessions" in record.annotations:
                # Only expect FIRST accession to work!
                key = record.annotations["accessions"][0]
                assert key, "Blank accession in annotation %s" % repr(record.annotations)
                if key != record.id:
                    # Retrieving by accession
                    db_rec = db.lookup(accession=key)
                    compare_record(record, db_rec)

            if "gi" in record.annotations:
                key = record.annotations['gi']
                if key != record.id:
                    # Retrieving by GI
                    db_rec = db.lookup(primary_id=key)
                    compare_record(record, db_rec)
def check_rewrite(self, filename):
    """Write one EMBL record back out as "embl" and check it re-reads equal."""
    original = SeqIO.read(filename, "embl")

    # TODO - Check these properties:
    # They are reset/trimmed/removed on the original before the comparison.
    original.dbxrefs = []
    first_accession_only = original.annotations['accessions'][:1]
    original.annotations['accessions'] = first_accession_only
    del original.annotations['references']

    out_handle = StringIO()
    written = SeqIO.write(original, out_handle, "embl")
    self.assertEqual(1, written)

    out_handle.seek(0)
    reloaded = SeqIO.read(out_handle, "embl")
    self.assertTrue(compare_record(original, reloaded))
# NOTE(review): this is a fragment of a larger routine; db, server,
# t_filename, t_format and t_count are expected from the enclosing scope,
# which is not visible here. It uses Python 2 print statements.

# Load every record of the file into the database and check the count.
iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
count = db.load(iterator)
assert count == t_count
#print " - Committing %i records" % count
server.commit()

# Parse the file a second time and look each record up again.
iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
for record in iterator :
    print " - %s, %s" % (checksum_summary(record), record.id)
    key = record.name
    # The trailing comma keeps the later "OK" on the same output line.
    print " - Retrieving by name/display_id '%s'," % key,
    db_rec = db.lookup(name=key)
    compare_record(record, db_rec)
    db_rec = db.lookup(display_id=key)
    compare_record(record, db_rec)
    print "OK"
    key = record.id
    # Only ids of the form "<something>.<digits>" are looked up by version.
    if key.count(".")==1 and key.split(".")[1].isdigit() :
        print " - Retrieving by version '%s'," % key,
        db_rec = db.lookup(version=key)
        compare_record(record, db_rec)
        print "OK"
    if "accessions" in record.annotations :
        # De-duplicate the accessions before checking each one.
        accs = set(record.annotations["accessions"])
        for key in accs :
            assert key, "Blank accession in annotation %s" % repr(accs)
def get_raw_check(self, filename, format, alphabet, comp):
    """Verify ``get_raw`` of a ``SeqIO.index`` against the file's raw bytes.

    When *comp* is true the file is read through ``gzip`` to obtain the raw
    bytes and the record ids. Each raw record must be ``bytes``, non-blank,
    occur in the file contents, and re-parse into a record equal to the one
    the index returns for that key.
    """
    def lower_key(identifier):
        # Also checking the key_function here
        return identifier.lower()

    if comp:
        stream = gzip.open(filename, "rb")
        raw_file = stream.read()
        stream.close()
        stream = gzip_open(filename, format)
        id_list = [rec.id.lower()
                   for rec in SeqIO.parse(stream, format, alphabet)]
        stream.close()
    else:
        stream = open(filename, "rb")
        raw_file = stream.read()
        stream.close()
        id_list = [rec.id.lower()
                   for rec in SeqIO.parse(filename, format, alphabet)]

    if format == "sff":
        # Parser warnings are silenced while indexing SFF files.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', BiopythonParserWarning)
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lower_key)
    else:
        rec_dict = SeqIO.index(filename, format, alphabet,
                               key_function=lower_key)

    self.assertEqual(set(id_list), set(rec_dict))
    self.assertEqual(len(id_list), len(rec_dict))

    for key in id_list:
        self.assertTrue(key in rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(isinstance(raw, bytes),
                        "Didn't get bytes from %s get_raw" % format)
        self.assertTrue(raw.strip())
        self.assertTrue(raw in raw_file)
        rec1 = rec_dict[key]

        # Following isn't very elegant, but it lets me test the
        # __getitem__ SFF code is working.
        if format in SeqIO._BinaryFormats:
            record_handle = BytesIO(raw)
        else:
            record_handle = StringIO(_bytes_to_string(raw))

        if format in ("sff", "sff-trim"):
            # Both SFF flavours share the reader; only trimming differs.
            proxy = rec_dict._proxy
            rec2 = SeqIO.SffIO._sff_read_seq_record(
                record_handle,
                proxy._flows_per_read,
                proxy._flow_chars,
                proxy._key_sequence,
                proxy._alphabet,
                trim=(format == "sff-trim"))
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(_as_bytes("<entry ")))
            self.assertTrue(raw.endswith(_as_bytes("</entry>")))
            # Currently the __getitem__ method uses this
            # trick too, but we hope to fix that later
            wrapped = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % _bytes_to_string(raw)
            rec2 = SeqIO.read(StringIO(wrapped), format, alphabet)
        else:
            rec2 = SeqIO.read(record_handle, format, alphabet)

        self.assertEqual(True, compare_record(rec1, rec2))

    rec_dict.close()
    del rec_dict
# NOTE(review): this is a fragment of a larger routine; db, server,
# db_count, t_filename, t_format and t_count are expected from the
# enclosing scope, which is not visible here. It uses Python 2 print
# statements.

# Load every record of the file into the database and check the count.
iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
count = db.load(iterator)
assert count == t_count
# Add this file's record count to the running total.
db_count += count
#print " - Committing %i records" % count
server.commit()

# Parse the file a second time and look each record up again.
iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
for record in iterator:
    print " - %s, %s" % (checksum_summary(record), record.id)
    key = record.name
    # The trailing comma keeps the later "OK" on the same output line.
    print " - Retrieving by name/display_id '%s'," % key,
    db_rec = db.lookup(name=key)
    compare_record(record, db_rec)
    db_rec = db.lookup(display_id=key)
    compare_record(record, db_rec)
    print "OK"
    key = record.id
    # Only ids of the form "<something>.<digits>" are looked up by version.
    if key.count(".")==1 and key.split(".")[1].isdigit():
        print " - Retrieving by version '%s'," % key,
        db_rec = db.lookup(version=key)
        compare_record(record, db_rec)
        print "OK"
    if "accessions" in record.annotations:
        # De-duplicate and sort the accessions before checking each one.
        accs = sorted(set(record.annotations["accessions"]))
        for key in accs:
            assert key, "Blank accession in annotation %s" % repr(accs)
def get_raw_check(self, filename, format, alphabet, comp):
    """Check ``get_raw`` of ``SeqIO.index`` and ``SeqIO.index_db`` agree.

    Each raw record from the ``SeqIO.index`` object must be ``bytes``,
    non-blank, occur in the file contents, equal the raw record returned by
    the in-memory ``SeqIO.index_db`` object, and re-parse into a record
    equal to the one the index returns for that key.

    Args:
        filename: path of the sequence file to index.
        format: SeqIO format name.
        alphabet: alphabet forwarded to the SeqIO calls.
        comp: true when *filename* is gzip-compressed.
    """
    # Also checking the key_function here
    if comp:
        with gzip.open(filename, "rb") as h:
            raw_file = h.read()
        h = gzip_open(filename, format)
        try:
            id_list = [rec.id.lower()
                       for rec in SeqIO.parse(h, format, alphabet)]
        finally:
            h.close()
    else:
        with open(filename, "rb") as h:
            raw_file = h.read()
        id_list = [rec.id.lower()
                   for rec in SeqIO.parse(filename, format, alphabet)]

    rec_dict = None
    rec_dict_db = None
    # try/finally so both indexes are closed even when an assertion fails;
    # the original never closed rec_dict_db at all.
    try:
        if format in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', BiopythonParserWarning)
                rec_dict = SeqIO.index(filename, format, alphabet,
                                       key_function=lambda x: x.lower())
                rec_dict_db = SeqIO.index_db(":memory:", filename, format,
                                             alphabet,
                                             key_function=lambda x: x.lower())
        else:
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())
            rec_dict_db = SeqIO.index_db(":memory:", filename, format,
                                         alphabet,
                                         key_function=lambda x: x.lower())

        self.assertEqual(set(id_list), set(rec_dict))
        self.assertEqual(set(id_list), set(rec_dict_db))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assertIn(key, rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(isinstance(raw, bytes),
                            "Didn't get bytes from %s get_raw" % format)
            self.assertTrue(raw.strip())
            self.assertIn(raw, raw_file)

            raw_db = rec_dict_db.get_raw(key)
            # Via index using format-specific get_raw which scans the file,
            # Via index_db in general using raw length found when indexing.
            self.assertEqual(raw, raw_db,
                             "index and index_db .get_raw() different for %s"
                             % format)

            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(_bytes_to_string(raw))
            if format == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=False)
            elif format == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=True)
            elif format == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "))
                self.assertTrue(raw.endswith(b"</entry>"))
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % _bytes_to_string(raw)
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            else:
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
    finally:
        # Compare against None: an index with no records is falsy but
        # still needs closing.
        if rec_dict is not None:
            rec_dict.close()
        if rec_dict_db is not None:
            rec_dict_db.close()
def test_records_end(self):
    """Compare the six records of ``self.idx._get_record(99228)`` to fixtures."""
    expected = [
        SeqRecord(Seq("TGTTTAGTACC----ATGCTTAGGAATGATAAACTCACTTAGTGtt"),
                  id="mm9.chr10",
                  name="mm9.chr10",
                  description="",
                  annotations={"start": 3021494,
                               "srcSize": 129993255,
                               "strand": 1,
                               "size": 42}),
        SeqRecord(Seq("TGTTGCATGTCCTTTATTCTTTGGCGTGATAGGCTCACCCAATCTT"),
                  id="ponAbe2.chr6",
                  name="ponAbe2.chr6",
                  description="",
                  annotations={"start": 16173516,
                               "srcSize": 174210431,
                               "strand": -1,
                               "size": 46}),
        SeqRecord(Seq("TGTTGCATATCCTTTATTCTTTGGCGTGATAGGCTCACCCAATCTT"),
                  id="panTro2.chr6",
                  name="panTro2.chr6",
                  description="",
                  annotations={"start": 16393864,
                               "srcSize": 173908612,
                               "strand": -1,
                               "size": 46}),
        SeqRecord(Seq("TGTTGCATGTCGTTTATTCTTTGGCGTGATAGGCTCACCCAATCTT"),
                  id="hg18.chr6",
                  name="hg18.chr6",
                  description="",
                  annotations={"start": 15875298,
                               "srcSize": 170899992,
                               "strand": -1,
                               "size": 46}),
        SeqRecord(Seq("TGTTAAGTCTCACTTGCTGTTCAAAGTGATAGCTTCACTCCATCAT"),
                  id="canFam2.chr1",
                  name="canFam2.chr1",
                  description="",
                  annotations={"start": 78072287,
                               "srcSize": 125616256,
                               "strand": -1,
                               "size": 46}),
        SeqRecord(Seq("TGTTTAAAATG----ATTGCTAGAACTTCTA--CTCACTGGA----"),
                  id="ornAna1.chr2",
                  name="ornAna1.chr2",
                  description="",
                  annotations={"start": 14757144,
                               "srcSize": 54797317,
                               "strand": -1,
                               "size": 36}),
    ]

    fetched = self.idx._get_record(99228)

    # Index into the fetched records (rather than zip) so that a short
    # result still fails loudly instead of being silently truncated.
    for position, wanted in enumerate(expected):
        self.assertTrue(compare_record(wanted, fetched[position]))
def test_records_end(self):
    """Compare the six records of ``self.idx._get_record(99228)`` to fixtures."""
    # One row per expected record:
    # (sequence, id and name, start, srcSize, strand, size)
    rows = (
        ("TGTTTAGTACC----ATGCTTAGGAATGATAAACTCACTTAGTGtt",
         "mm9.chr10", 3021494, 129993255, 1, 42),
        ("TGTTGCATGTCCTTTATTCTTTGGCGTGATAGGCTCACCCAATCTT",
         "ponAbe2.chr6", 16173516, 174210431, -1, 46),
        ("TGTTGCATATCCTTTATTCTTTGGCGTGATAGGCTCACCCAATCTT",
         "panTro2.chr6", 16393864, 173908612, -1, 46),
        ("TGTTGCATGTCGTTTATTCTTTGGCGTGATAGGCTCACCCAATCTT",
         "hg18.chr6", 15875298, 170899992, -1, 46),
        ("TGTTAAGTCTCACTTGCTGTTCAAAGTGATAGCTTCACTCCATCAT",
         "canFam2.chr1", 78072287, 125616256, -1, 46),
        ("TGTTTAAAATG----ATTGCTAGAACTTCTA--CTCACTGGA----",
         "ornAna1.chr2", 14757144, 54797317, -1, 36),
    )

    wanted_records = []
    for sequence, identifier, start, src_size, strand, size in rows:
        annotations = {
            "start": start,
            "srcSize": src_size,
            "strand": strand,
            "size": size,
        }
        wanted_records.append(
            SeqRecord(
                Seq(sequence),
                id=identifier,
                name=identifier,
                description="",
                annotations=annotations))

    fetched_recs = self.idx._get_record(99228)

    # Walk the expected records by position so a short fetch result raises
    # rather than being silently truncated.
    for position, wanted in enumerate(wanted_records):
        self.assertTrue(compare_record(wanted, fetched_recs[position]))
# NOTE(review): this is a fragment of a larger routine; db, server,
# db_count, t_filename, t_format and t_count are expected from the
# enclosing scope, which is not visible here. It uses Python 2 print
# statements.

# Load every record of the file into the database and check the count.
iterator = SeqIO.parse(handle=open(t_filename, "r"), format=t_format)
count = db.load(iterator)
assert count == t_count
# Add this file's record count to the running total.
db_count += count
#print " - Committing %i records" % count
server.commit()

# Parse the file a second time and look each record up again.
iterator = SeqIO.parse(handle=open(t_filename, "r"), format=t_format)
for record in iterator:
    print " - %s, %s" % (checksum_summary(record), record.id)
    key = record.name
    # The trailing comma keeps the later "OK" on the same output line.
    print " - Retrieving by name/display_id '%s'," % key,
    db_rec = db.lookup(name=key)
    compare_record(record, db_rec)
    db_rec = db.lookup(display_id=key)
    compare_record(record, db_rec)
    print "OK"
    key = record.id
    # Only ids of the form "<something>.<digits>" are looked up by version.
    if key.count(".") == 1 and key.split(".")[1].isdigit():
        print " - Retrieving by version '%s'," % key,
        db_rec = db.lookup(version=key)
        compare_record(record, db_rec)
        print "OK"
    if "accessions" in record.annotations:
        # De-duplicate the accessions before checking each one.
        accs = set(record.annotations["accessions"])
        for key in accs:
            assert key, "Blank accession in annotation %s" % repr(accs)
def get_raw_check(self, filename, format, alphabet, comp):
    """Check ``get_raw`` of ``SeqIO.index`` (and ``index_db`` if available).

    Each raw record from the ``SeqIO.index`` object must be ``bytes``,
    non-blank, occur in the file contents, and re-parse into a record equal
    to the one the index returns for that key. When ``sqlite3`` is truthy
    an in-memory ``SeqIO.index_db`` is built as well and its raw records
    must equal those of the plain index.

    Args:
        filename: path of the sequence file to index.
        format: SeqIO format name.
        alphabet: alphabet forwarded to the SeqIO calls.
        comp: true when *filename* is gzip-compressed.
    """
    # Also checking the key_function here
    if comp:
        with gzip.open(filename, "rb") as handle:
            raw_file = handle.read()
        with gzip_open(filename, format) as handle:
            id_list = [
                rec.id.lower()
                for rec in SeqIO.parse(handle, format, alphabet)
            ]
    else:
        with open(filename, "rb") as handle:
            raw_file = handle.read()
        id_list = [
            rec.id.lower()
            for rec in SeqIO.parse(filename, format, alphabet)
        ]

    rec_dict = None
    rec_dict_db = None
    # try/finally so both indexes are closed even when an assertion fails;
    # the original never closed rec_dict_db at all.
    try:
        if format in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", BiopythonParserWarning)
                rec_dict = SeqIO.index(
                    filename, format, alphabet,
                    key_function=lambda x: x.lower())  # noqa: E731
                if sqlite3:
                    rec_dict_db = SeqIO.index_db(
                        ":memory:", filename, format, alphabet,
                        key_function=lambda x: x.lower())  # noqa: E731
        else:
            rec_dict = SeqIO.index(
                filename, format, alphabet,
                key_function=lambda x: x.lower())  # noqa: E731
            if sqlite3:
                rec_dict_db = SeqIO.index_db(
                    ":memory:", filename, format, alphabet,
                    key_function=lambda x: x.lower())  # noqa: E731

        self.assertEqual(set(id_list), set(rec_dict))
        if sqlite3:
            self.assertEqual(set(id_list), set(rec_dict_db))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assertIn(key, rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(isinstance(raw, bytes),
                            "Didn't get bytes from %s get_raw" % format)
            self.assertTrue(raw.strip())
            self.assertIn(raw, raw_file)

            if sqlite3:
                raw_db = rec_dict_db.get_raw(key)
                # Via index using format-specific get_raw which scans the
                # file, via index_db in general using raw length found
                # when indexing.
                self.assertEqual(
                    raw, raw_db,
                    "index and index_db .get_raw() different for %s"
                    % format)

            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(raw.decode())
            if format == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=False)
            elif format == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    rec_dict._proxy._alphabet,
                    trim=True)
            elif format == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "))
                self.assertTrue(raw.endswith(b"</entry>"))
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % raw.decode()
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            else:
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
    finally:
        # Compare against None: an index with no records is falsy but
        # still needs closing.
        if rec_dict is not None:
            rec_dict.close()
        if rec_dict_db is not None:
            rec_dict_db.close()
def get_raw_check(self, filename, fmt, comp):
    """Check ``get_raw`` of ``SeqIO.index`` (and ``index_db`` if available).

    Each raw record from the ``SeqIO.index`` object must be ``bytes``,
    non-blank, occur in the file contents, and re-parse into a record equal
    to the one the index returns for that key. When ``sqlite3`` is truthy
    an in-memory ``SeqIO.index_db`` is built as well and its raw records
    must equal those of the plain index.

    Args:
        filename: path of the sequence file to index.
        fmt: SeqIO format name.
        comp: true when *filename* is gzip-compressed.

    Raises:
        RuntimeError: if ``self.get_mode(fmt)`` is neither "b" nor "t".
    """
    # Also checking the key_function here
    msg = "Test failure parsing file %s with format %s" % (filename, fmt)
    if comp:
        with gzip.open(filename, "rb") as handle:
            raw_file = handle.read()
        mode = "r" + self.get_mode(fmt)
        with gzip.open(filename, mode) as handle:
            id_list = [rec.id.lower() for rec in SeqIO.parse(handle, fmt)]
    else:
        with open(filename, "rb") as handle:
            raw_file = handle.read()
        id_list = [rec.id.lower() for rec in SeqIO.parse(filename, fmt)]

    rec_dict = None
    rec_dict_db = None
    # try/finally so both indexes are closed even when an assertion fails;
    # the original never closed rec_dict_db at all.
    try:
        if fmt in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", BiopythonParserWarning)
                rec_dict = SeqIO.index(filename, fmt, key_function=str.lower)
                if sqlite3:
                    rec_dict_db = SeqIO.index_db(
                        ":memory:",
                        filename,
                        fmt,
                        key_function=str.lower,
                    )
        else:
            rec_dict = SeqIO.index(filename, fmt, key_function=str.lower)
            if sqlite3:
                rec_dict_db = SeqIO.index_db(
                    ":memory:",
                    filename,
                    fmt,
                    key_function=str.lower,
                )

        self.assertEqual(set(id_list), set(rec_dict), msg=msg)
        if sqlite3:
            self.assertEqual(set(id_list), set(rec_dict_db), msg=msg)
        self.assertEqual(len(id_list), len(rec_dict), msg=msg)
        for key in id_list:
            self.assertIn(key, rec_dict, msg=msg)
            self.assertEqual(key, rec_dict[key].id.lower(), msg=msg)
            self.assertEqual(key, rec_dict.get(key).id.lower(), msg=msg)
            raw = rec_dict.get_raw(key)
            self.assertIsInstance(raw, bytes, msg=msg)
            self.assertTrue(raw.strip(), msg=msg)
            self.assertIn(raw, raw_file, msg=msg)

            if sqlite3:
                raw_db = rec_dict_db.get_raw(key)
                # Via index using format-specific get_raw which scans the
                # file, via index_db in general using raw length found
                # when indexing.
                self.assertEqual(raw, raw_db, msg=msg)

            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            mode = self.get_mode(fmt)
            if mode == "b":
                handle = BytesIO(raw)
            elif mode == "t":
                handle = StringIO(raw.decode())
            else:
                raise RuntimeError("Unexpected mode %s" % mode)
            if fmt == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    trim=False,
                )
            elif fmt == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    rec_dict._proxy._flows_per_read,
                    rec_dict._proxy._flow_chars,
                    rec_dict._proxy._key_sequence,
                    trim=True,
                )
            elif fmt == "uniprot-xml":
                self.assertTrue(raw.startswith(b"<entry "), msg=msg)
                self.assertTrue(raw.endswith(b"</entry>"), msg=msg)
                # Currently the __getitem__ method uses this
                # trick too, but we hope to fix that later
                raw = ("""<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % raw.decode())
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, fmt)
            else:
                rec2 = SeqIO.read(handle, fmt)
            # Carry the same failure message as every other assertion here.
            self.assertEqual(True, compare_record(rec1, rec2), msg=msg)
    finally:
        # Compare against None: an index with no records is falsy but
        # still needs closing.
        if rec_dict is not None:
            rec_dict.close()
        if rec_dict_db is not None:
            rec_dict_db.close()