def test_cycle(self):
    """Read the test array row by row through a cyclic generator.

    The number of passes starts from
    ``lcm(block_size, len(test_array)) // len(test_array)`` and is bumped
    to 4 when that is below 3, or to 6 when it is exactly 3. The collected
    rows are compared with the array repeated that many times, using each
    row's ``[0, 0]`` element as the key given to ``assert_items_equal``.
    """
    block_size = 45
    array_len = len(self.test_array)
    num_cycles = lcm(block_size, array_len) // array_len
    if num_cycles < 3:
        num_cycles = 4
    elif num_cycles == 3:
        num_cycles = 6
    total_rows = num_cycles * array_len

    reader = multitables.Streamer(filename=self.test_filename)
    ary = reader.get_generator(path=self.test_array_path, cyclic=True,
                               block_size=block_size)
    try:
        result = []
        for i, row in enumerate(ary):
            # cyclic=True was requested, so the loop is bounded explicitly.
            if i >= total_rows:
                break
            result.append(row)

        assert_items_equal(self, result,
                           list(self.test_array) * num_cycles,
                           key=lambda x: x[0, 0])
    finally:
        # Close the generator even when the comparison above fails.
        ary.close()
def test_cycle_ordered(self):
    """Cyclic row-generator test with ``ordered=True`` (currently disabled).

    The body mirrors the unordered cyclic test but passes ``key=None`` to
    ``assert_items_equal``.
    """
    # Report the test as skipped rather than returning silently, which would
    # count as a pass without checking anything. Remove this line to
    # re-enable the test.
    self.skipTest("cyclic ordered generator test is disabled")

    block_size = 45
    array_len = len(self.test_array)
    num_cycles = lcm(block_size, array_len) // array_len
    if num_cycles < 3:
        num_cycles = 4
    elif num_cycles == 3:
        num_cycles = 6
    total_rows = num_cycles * array_len

    reader = multitables.Streamer(filename=self.test_filename)
    ary = reader.get_generator(path=self.test_array_path, cyclic=True,
                               block_size=block_size, ordered=True)
    try:
        result = []
        for i, row in enumerate(ary):
            # cyclic=True was requested, so the loop is bounded explicitly.
            if i >= total_rows:
                break
            result.append(row)

        assert_items_equal(self, result,
                           list(self.test_array) * num_cycles,
                           key=None)
    finally:
        # Close the generator even when the comparison above fails.
        ary.close()
def bench_generator(filename, node_path, n_procs, read_iters, **kwargs):
    """Time reading ``read_iters`` rows through a cyclic row generator.

    :param filename: File handed to ``multitables.Streamer``.
    :param node_path: Dataset path handed to ``get_generator``.
    :param n_procs: Forwarded to ``get_generator`` as ``n_procs``.
    :param read_iters: Number of rows to pull before stopping.
    :param kwargs: Extra keyword arguments for ``multitables.Streamer``.
    :return: Elapsed time in seconds, measured with ``time.time()``.
    """
    stream = multitables.Streamer(filename, **kwargs)
    gen = stream.get_generator(node_path, n_procs=n_procs, cyclic=True)
    try:
        start = time.time()
        for i, _ in tqdm.tqdm(enumerate(gen), total=read_iters):
            # cyclic=True was requested, so stop explicitly.
            if i >= read_iters:
                break
        end = time.time()
    finally:
        # Closing happens outside the timed region, and also on errors.
        gen.close()
    return end - start
def bench_direct(filename, node_path, n_procs, read_iters, **kwargs):
    """Time reading roughly ``read_iters`` rows block-wise from a cyclic queue.

    One block is fetched per step of ``range(0, read_iters, q.block_size)``;
    each block is acquired and released without touching its contents.

    :param filename: File handed to ``multitables.Streamer``.
    :param node_path: Dataset path handed to ``get_queue``.
    :param n_procs: Forwarded to ``get_queue`` as ``n_procs``.
    :param read_iters: Row count that determines how many blocks are fetched.
    :param kwargs: Extra keyword arguments for ``multitables.Streamer``.
    :return: Elapsed time in seconds, measured with ``time.time()``.
    """
    stream = multitables.Streamer(filename, **kwargs)
    q = stream.get_queue(node_path, n_procs=n_procs, cyclic=True)
    try:
        start = time.time()
        for _ in tqdm.tqdm(range(0, read_iters, q.block_size)):
            # Entering and leaving the guard is all that is being timed.
            with q.get():
                pass
        end = time.time()
    finally:
        # Closing happens outside the timed region, and also on errors.
        q.close()
    return end - start
def __init__(self, filename, batch_size, **kw_args):
    """
    Set up an HDF5 file reader that hands out batches of ``batch_size``.

    The batch size counts elements along the outer-most dimension of the
    datasets; it can be thought of as the number of rows that are read at
    once and returned to the user.

    :param filename: The HDF5 file to read.
    :param batch_size: The size of the batches to be read.
    :param kw_args: Optional arguments to pass to multitables.
    """
    # The streamer is created first; everything else is plain bookkeeping
    # that starts out empty or unset.
    self.streamer = mtb.Streamer(filename, **kw_args)
    self.vars = []
    self.batch_size = batch_size
    self.queues = []
    self.order_lock = None
def test_cycle(self):
    """Read the test array row by row through a cyclic generator.

    The base pass count is
    ``lcm(block_size, len(test_array)) // len(test_array)``; it is then
    raised to the smallest multiple of itself that is strictly greater
    than 100. The collected rows are compared with the array repeated that
    many times, using each row's ``[0, 0]`` element as the key given to
    ``assert_items_equal``.
    """
    block_size = 45
    array_len = len(self.test_array)
    num_cycles = lcm(block_size, array_len) // array_len
    # Integer form of the previous max(n, n * (int(100 / n) + 1)); the
    # second operand is never smaller than n, so max() was redundant.
    num_cycles = num_cycles * (100 // num_cycles + 1)
    total_rows = num_cycles * array_len

    reader = multitables.Streamer(filename=self.test_filename)
    ary = reader.get_generator(path=self.test_array_path, cyclic=True,
                               block_size=block_size)
    try:
        result = []
        for i, row in enumerate(ary):
            # cyclic=True was requested, so the loop is bounded explicitly.
            if i >= total_rows:
                break
            result.append(row)

        assert_items_equal(self, result,
                           list(self.test_array) * num_cycles,
                           key=lambda x: x[0, 0])
    finally:
        # Close the generator even when the comparison above fails.
        ary.close()

    # NOTE(review): the queue-based variant below is disabled by this early
    # return; why it was switched off is not recorded here. Delete the
    # return to re-enable it.
    return

    queue = reader.get_queue(path=self.test_array_path, cyclic=True,
                             block_size=block_size)
    result = []
    for _ in range(total_rows // block_size):
        guard = queue.get()
        self.assertIsNot(guard, multitables.QueueClosed)
        with guard as batch:
            result.append(batch.copy())

    assert_items_equal(self, np.concatenate(result, axis=0),
                       list(self.test_array) * num_cycles,
                       key=lambda x: x[0, 0])
    queue.close()
def test_threaded(self):
    """Drain one queue from 100 threads and compare the collected batches.

    Currently disabled. The body uses a block size of
    ``len(test_array) // 100``, lets every thread copy batches into a
    shared list, adds the remainder batch, and compares the result with
    ``get_batches(test_array, block_size)``.
    """
    # Report the test as skipped rather than returning silently, which would
    # count as a pass without checking anything. Remove this line to
    # re-enable the test.
    self.skipTest("threaded queue test is disabled")

    block_size = len(self.test_array) // 100

    reader = multitables.Streamer(filename=self.test_filename)
    queue = reader.get_queue(path=self.test_array_path, n_procs=4,
                             block_size=block_size)
    try:
        lock = threading.Lock()
        result = []

        def read():
            while True:
                guard = queue.get()
                if guard is multitables.QueueClosed:
                    break
                # Copy inside the guard, then append under the lock.
                with guard as batch:
                    batch_copy = batch.copy()
                with lock:
                    result.append(batch_copy)

        threads = [threading.Thread(target=read) for _ in range(100)]
        for t in threads:
            t.start()

        last_batch = reader.get_remainder(path=self.test_array_path,
                                          block_size=queue.block_size)
        if 100 * block_size == len(self.test_array):
            self.assertEqual(len(last_batch), 0)
        else:
            # Worker threads may still be appending at this point.
            with lock:
                result.append(last_batch)

        for t in threads:
            t.join()

        assert_items_equal(self, result,
                           get_batches(self.test_array, block_size),
                           key=lambda x: x[0, 0, 0])
    finally:
        # Close the queue even when an assertion above fails.
        queue.close()
def test_generator(self):
    """Compare row generators for the array and the table with the fixtures.

    Each generator is fully materialised with ``list`` and checked via
    ``assert_items_equal`` against the matching fixture, then closed.
    """
    reader = multitables.Streamer(filename=self.test_filename)

    # Array dataset: rows are keyed on their [0, 0] element.
    array_rows = reader.get_generator(path=self.test_array_path)
    produced = list(array_rows)
    expected = list(self.test_array)
    assert_items_equal(self, produced, expected,
                       key=lambda row: row[0, 0])
    array_rows.close()

    # Table dataset: rows are keyed on the first element of 'col_B'.
    table_rows = reader.get_generator(path=self.test_table_path)
    produced = list(table_rows)
    expected = list(self.test_table_ary)
    assert_items_equal(self, produced, expected,
                       key=lambda row: row['col_B'][0][0])
    table_rows.close()
def test_ordered(self):
    """Compare ``ordered=True`` row generators with the fixtures.

    The array dataset is checked first, then the table dataset; both are
    compared through ``assert_items_equal`` with ``key=None``.
    """
    reader = multitables.Streamer(filename=self.test_filename)

    cases = (
        (self.test_array_path, self.test_array),
        (self.test_table_path, self.test_table_ary),
    )
    for dataset_path, fixture in cases:
        rows = reader.get_generator(path=dataset_path, ordered=True)
        assert_items_equal(self, list(rows), list(fixture), key=None)
        rows.close()
def test_direct(self):
    """Read the array block-wise straight from a queue, in two ways.

    First with ``block_size=None`` and manual ``queue.get()`` polling until
    ``QueueClosed`` is returned, then with ``block_size=16`` and
    ``queue.iter()``. In both cases the remainder batch is appended and the
    result is compared with ``get_batches(test_array, queue.block_size)``.
    """
    reader = multitables.Streamer(filename=self.test_filename)

    # Pass 1: block_size=None; the effective size is read back from the
    # queue afterwards.
    queue = reader.get_queue(path=self.test_array_path, block_size=None)
    batches = []
    guard = queue.get()
    while guard is not multitables.QueueClosed:
        with guard as batch:
            batches.append(batch.copy())
        guard = queue.get()
    remainder = reader.get_remainder(path=self.test_array_path,
                                     block_size=queue.block_size)
    batches.append(remainder)
    expected = get_batches(self.test_array, queue.block_size)
    assert_items_equal(self, batches, expected,
                       key=lambda item: item[0, 0, 0])
    queue.close()

    # Pass 2: explicit block size, iterating guards with queue.iter().
    queue = reader.get_queue(path=self.test_array_path, block_size=16)
    batches = []
    for guard in queue.iter():
        with guard as batch:
            batches.append(batch.copy())
    remainder = reader.get_remainder(path=self.test_array_path,
                                     block_size=queue.block_size)
    batches.append(remainder)
    expected = get_batches(self.test_array, queue.block_size)
    assert_items_equal(self, batches, expected,
                       key=lambda item: item[0, 0, 0])
    queue.close()
def test_howto(self):
    """Walk through the queue and generator usage patterns (disabled).

    The body exercises, in order: a non-cyclic queue polled with
    ``queue.get()``, the same queue via ``queue.iter()``, the remainder
    block, a cyclic queue read once, and a row generator.
    """
    # Report the test as skipped rather than returning silently, which would
    # count as a pass without checking anything. Remove this line to
    # re-enable the walkthrough.
    self.skipTest("how-to walkthrough is disabled")

    kw_args = {}
    stream = multitables.Streamer(filename=self.test_filename, **kw_args)

    def do_something(x):
        return x

    def do_something_else(x):
        return x

    queue = stream.get_queue(
        # Path to dataset within the H5file.
        path=self.test_array_path,
        # Number of processes to launch for parallel reads. Defaults to 4.
        n_procs=4,
        # Size of internal buffer in no. of blocks. Defaults to 2*n_proc+1.
        read_ahead=5,
        # A cyclic reader wraps at the end of the dataset.
        # Defaults to False.
        cyclic=False,
        # Size (along the outer dimension) of the blocks that will be read.
        # Defaults to a multiple of the dataset chunk size, or a 128KB
        # block. Should be left to the default or carefully chosen for
        # chunked arrays, else performance degradation can occur.
        block_size=32,
        # Force the stream to return blocks in on-disk order. Useful if two
        # datasets need to be read synchronously. This option may have a
        # performance penalty.
        ordered=False,
    )

    while True:
        # Get the guard object, will block until data is ready.
        guard = queue.get()
        if guard is multitables.QueueClosed:
            # Terminate the loop once the dataset is finished.
            break
        # The guard returns the next block of data in the buffer.
        with guard as block:
            do_something(block)  # Perform actions on the data

    # The same polling loop a second time.
    while True:
        guard = queue.get()
        if guard is multitables.QueueClosed:
            break
        with guard as block:
            do_something(block)

    # Equivalent iteration through the queue's own iterator.
    for guard in queue.iter():
        with guard as block:
            do_something(block)

    # Retrieved for demonstration only; the value is not used further.
    last_block = stream.get_remainder(self.test_array_path,
                                      queue.block_size)

    queue.close()

    queue = stream.get_queue(
        # Path to dataset within the H5file.
        path=self.test_array_path,
        # Number of processes to launch for parallel reads.
        # NOTE(review): this comment used to say "Defaults to 2" while the
        # first call says "Defaults to 4" - confirm the real default.
        n_procs=2,
        # Size of internal buffer in no. of blocks. Defaults to 2*n_proc+1.
        read_ahead=5,
        # A cyclic reader wraps at the end of the dataset.
        # Defaults to False.
        cyclic=True,
    )

    # Only one block is read here, so the loop is left after the first one.
    while True:
        with queue.get() as block:
            do_something(block)
            break

    # The cyclic queue was previously never closed.
    queue.close()

    gen = stream.get_generator(self.test_array_path, n_procs=4,
                               read_ahead=9, cyclic=False, block_size=32)
    for row in gen:
        do_something_else(row)

    gen.close()
def test_quickstart(self):
    """Minimal quick-start: iterate the array dataset row by row (disabled)."""
    # Report the test as skipped rather than returning silently, which would
    # count as a pass without checking anything. Remove this line to
    # re-enable the walkthrough.
    self.skipTest("quick-start walkthrough is disabled")

    def do_something(x):
        return x

    stream = multitables.Streamer(filename=self.test_filename)
    for row in stream.get_generator(path=self.test_array_path):
        do_something(row)