def check_blocks(self, old_file, new_file):
    """Assert that two files yield identical BGZF block listings.

    Both files are opened in binary mode and passed to ``bgzf.BgzfBlocks``;
    the resulting lists must have the same length and compare equal.

    old_file -- path of the reference file.
    new_file -- path of the file to compare against the reference.
    """
    # Context managers guarantee each handle is closed even if iterating
    # the blocks raises part-way through.
    with open(old_file, "rb") as h:
        old = list(bgzf.BgzfBlocks(h))
    with open(new_file, "rb") as h:
        new = list(bgzf.BgzfBlocks(h))
    # Compare lengths first so a block-count mismatch gives a short message
    # before the (potentially long) full list comparison.
    self.assertEqual(len(old), len(new))
    self.assertEqual(old, new)
def check_blocks(self, old_file, new_file):
    """Assert that two files yield identical BGZF block listings.

    Each file is opened in binary mode and its blocks are collected via
    ``bgzf.BgzfBlocks``; the two lists must match in length and content.
    """

    def _load_blocks(path):
        # Read every block record from one file, closing it afterwards.
        with open(path, "rb") as handle:
            return list(bgzf.BgzfBlocks(handle))

    expected_blocks = _load_blocks(old_file)
    actual_blocks = _load_blocks(new_file)

    # Length check first: a count mismatch is reported concisely before the
    # full list comparison runs.
    self.assertEqual(len(expected_blocks), len(actual_blocks))
    self.assertEqual(expected_blocks, actual_blocks)
def populate_blocks():
    """Return one tuple per BGZF block of the bgzipped FASTA file.

    Each tuple is (uncompressed data start, uncompressed data end,
    compressed block start), built from items 2, 3 and 0 of the records
    yielded by ``bgzf.BgzfBlocks``. The block count and the time elapsed
    since ``start_time`` are reported through ``verbose_print``.
    """
    with open(self.bgzipped_fasta_filename, 'rb') as fasta_file_for_blocks:
        blocks = []
        for record in bgzf.BgzfBlocks(fasta_file_for_blocks):
            data_start = record[2]
            data_end = data_start + record[3]
            blocks.append((data_start, data_end, record[0]))
        elapsed = datetime.datetime.now() - start_time
        verbose_print('\t\tFound {} blocks in {}'.format(len(blocks), elapsed))
    return blocks
def check_random(self, filename):
    """Check BGZF random access by reading blocks in forward & reverse order.

    The file is first decompressed in full with ``gzip`` to obtain the
    reference data, and its block table is read with ``bgzf.BgzfBlocks``.
    A ``bgzf.BgzfReader`` is then exercised in four ways:

    1. reading every block in file order,
    2. reading every block in reverse order,
    3. jumping from a late block back to an early one (only when the file
       has at least three blocks),
    4. checking ``seek``/``tell`` at the start, near the start, the middle
       and the end of every block, visited in shuffled order.

    Every reader is closed in a ``finally`` clause so a failing assertion
    does not leave file handles open.
    """
    # Reference copy of the fully decompressed data.
    h = gzip.open(filename, "rb")
    try:
        old = h.read()
    finally:
        h.close()

    # Block table: (start, raw_len, data_start, data_len) per block.
    h = open(filename, "rb")
    try:
        blocks = list(bgzf.BgzfBlocks(h))
    finally:
        h.close()

    # Forward
    new = _empty_bytes_string
    h = bgzf.BgzfReader(filename, "rb")
    try:
        self.assertTrue(h.seekable())
        self.assertFalse(h.isatty())
        self.assertEqual(h.fileno(), h._handle.fileno())
        for start, raw_len, data_start, data_len in blocks:
            h.seek(bgzf.make_virtual_offset(start, 0))
            data = h.read(data_len)
            self.assertEqual(len(data), data_len)
            # Disabled check kept from the original:
            # self.assertEqual(start + raw_len, h._handle.tell())
            # Data accumulated so far must end where this block begins.
            self.assertEqual(len(new), data_start)
            new += data
    finally:
        h.close()
    self.assertEqual(len(old), len(new))
    self.assertEqual(old, new)

    # Reverse
    new = _empty_bytes_string
    h = bgzf.BgzfReader(filename, "rb")
    try:
        for start, raw_len, data_start, data_len in blocks[::-1]:
            h.seek(bgzf.make_virtual_offset(start, 0))
            data = h.read(data_len)
            self.assertEqual(len(data), data_len)
            # Disabled check kept from the original:
            # self.assertEqual(start + raw_len, h._handle.tell())
            # Blocks arrive last-to-first, so prepend to rebuild the data.
            new = data + new
    finally:
        h.close()
    self.assertEqual(len(old), len(new))
    self.assertEqual(old, new)

    # Jump back - non-sequential seeking
    if len(blocks) >= 3:
        # NOTE(review): max_cache=1 presumably forces blocks to be
        # re-read rather than served from the reader's cache - confirm.
        h = bgzf.BgzfReader(filename, "rb", max_cache=1)
        try:
            # Seek to a late block in the file,
            # half way into the third last block
            start, raw_len, data_start, data_len = blocks[-3]
            voffset = bgzf.make_virtual_offset(start, data_len // 2)
            h.seek(voffset)
            self.assertEqual(voffset, h.tell())
            data = h.read(1000)
            self.assertTrue(data in old)
            self.assertEqual(old.find(data), data_start + data_len // 2)

            # Now seek to an early block in the file,
            # half way into the second block
            start, raw_len, data_start, data_len = blocks[1]
            # NOTE(review): this seek targets the same virtual offset as
            # the one two lines below and looks redundant; it is kept
            # because it was in the original - confirm before removing.
            h.seek(bgzf.make_virtual_offset(start, data_len // 2))
            voffset = bgzf.make_virtual_offset(start, data_len // 2)
            h.seek(voffset)
            self.assertEqual(voffset, h.tell())
            # Now read all rest of this block and start of next block
            data = h.read(data_len + 1000)
            self.assertTrue(data in old)
            self.assertEqual(old.find(data), data_start + data_len // 2)
        finally:
            h.close()

    # Check seek/tell at block boundaries
    v_offsets = []
    for start, raw_len, data_start, data_len in blocks:
        for within_offset in [0, 1, data_len // 2, data_len - 1]:
            # Skip offsets that fall outside this block (e.g. for empty
            # or one-byte blocks).
            if within_offset < 0 or data_len <= within_offset:
                continue
            voffset = bgzf.make_virtual_offset(start, within_offset)
            real_offset = data_start + within_offset
            v_offsets.append((voffset, real_offset))
    shuffle(v_offsets)

    h = bgzf.BgzfReader(filename, "rb", max_cache=1)
    try:
        for voffset, real_offset in v_offsets:
            h.seek(0)
            # A unittest assertion is used instead of a bare ``assert`` so
            # the check still runs under ``python -O``.
            self.assertTrue(voffset >= 0 and real_offset >= 0)
            # Reading real_offset bytes from the start must land exactly
            # on the matching virtual offset.
            self.assertEqual(h.read(real_offset), old[:real_offset])
            self.assertEqual(h.tell(), voffset)
        for voffset, real_offset in v_offsets:
            h.seek(voffset)
            self.assertEqual(h.tell(), voffset)
    finally:
        h.close()
def test_BgzfBlocks_TypeError(self):
    """Check get expected TypeError from BgzfBlocks.

    ``bgzf.BgzfBlocks`` is given the handle returned by ``bgzf.open``
    rather than a plain binary file handle, and iterating it must raise
    TypeError for both the "r" and "rb" modes.
    """
    for mode in ("r", "rb"):
        decompressed = bgzf.open("GenBank/cor6_6.gb.bgz", mode)
        # Close the handle on every iteration, including when the expected
        # exception is not raised and the assertion fails.
        try:
            with self.assertRaises(TypeError):
                list(bgzf.BgzfBlocks(decompressed))
        finally:
            decompressed.close()