Exemple #1
0
    def test_cycle(self):
        """Read several full cycles from a cyclic generator and compare them.

        Rows are drawn from ``get_generator(cyclic=True, block_size=45)``
        until ``num_cycles * len(self.test_array)`` rows have been collected.
        The collected rows are then checked against ``num_cycles`` copies of
        ``self.test_array`` via ``assert_items_equal``, with a key that picks
        element ``[0, 0]`` of each row.
        """
        block_size = 45
        # ``lcm`` is defined elsewhere in the file (presumably the least common
        # multiple, which would make this the number of passes after which a
        # block boundary coincides with the end of the array again).
        num_cycles = lcm(block_size, len(self.test_array)) // len(
            self.test_array)
        # Very small counts are raised to a multiple of themselves:
        # 1 or 2 -> 4, and 3 -> 6.
        if num_cycles < 3:
            num_cycles = 4
        elif num_cycles == 3:
            num_cycles = 6
        # Loop-invariant row limit, computed once.
        total_rows = num_cycles * len(self.test_array)
        reader = multitables.Streamer(filename=self.test_filename)

        ary = reader.get_generator(path=self.test_array_path,
                                   cyclic=True,
                                   block_size=block_size)
        try:
            result = []
            for i, row in enumerate(ary):
                # A cyclic generator never ends on its own, so stop explicitly.
                if i >= total_rows:
                    break
                result.append(row)

            assert_items_equal(self,
                               result,
                               list(self.test_array) * num_cycles,
                               key=lambda x: x[0, 0])
        finally:
            # Close the generator even when the assertion fails, so the stream
            # is not left open by a failing test.
            ary.close()
    def test_cycle_ordered(self):
        """Cyclic read with ``ordered=True`` (currently disabled).

        The body mirrors the unordered cycle test but requests ``ordered=True``
        and passes ``key=None`` to ``assert_items_equal``. The test is reported
        as skipped instead of silently returning, so the runner does not count
        it as a pass.
        """
        # Previously a bare ``return``; skipping keeps the disabled state
        # visible in the test report. Everything below stays unreachable until
        # this line is removed.
        self.skipTest("test_cycle_ordered is currently disabled")
        block_size = 45
        num_cycles = lcm(block_size, len(self.test_array)) // len(
            self.test_array)
        # Very small counts are raised to a multiple of themselves:
        # 1 or 2 -> 4, and 3 -> 6.
        if num_cycles < 3:
            num_cycles = 4
        elif num_cycles == 3:
            num_cycles = 6
        total_rows = num_cycles * len(self.test_array)
        reader = multitables.Streamer(filename=self.test_filename)

        ary = reader.get_generator(path=self.test_array_path,
                                   cyclic=True,
                                   block_size=block_size,
                                   ordered=True)
        try:
            result = []
            for i, row in enumerate(ary):
                # A cyclic generator never ends on its own, so stop explicitly.
                if i >= total_rows:
                    break
                result.append(row)

            assert_items_equal(self,
                               result,
                               list(self.test_array) * num_cycles,
                               key=None)
        finally:
            # Close the generator even when the assertion fails.
            ary.close()
def bench_generator(filename, node_path, n_procs, read_iters, **kwargs):
    """Time row-by-row reads from a cyclic multitables generator.

    :param filename: File handed to ``multitables.Streamer``.
    :param node_path: Dataset path handed to ``get_generator``.
    :param n_procs: Forwarded to ``get_generator`` as ``n_procs``.
    :param read_iters: The loop stops once the row index reaches this value;
        also used as the ``total`` of the progress bar.
    :param kwargs: Extra keyword arguments for ``multitables.Streamer``.
    :return: Elapsed time in seconds, as a difference of ``time.time()`` calls.
    """
    stream = multitables.Streamer(filename, **kwargs)
    gen = stream.get_generator(node_path, n_procs=n_procs, cyclic=True)

    try:
        start = time.time()
        for i, _ in tqdm.tqdm(enumerate(gen), total=read_iters):
            # The generator is cyclic, so the loop must be ended explicitly.
            if i >= read_iters:
                break
        end = time.time()
    finally:
        # Close the generator outside the timed region, mirroring the
        # ``q.close()`` that bench_direct performs on its queue.
        gen.close()

    return end - start
def bench_direct(filename, node_path, n_procs, read_iters, **kwargs):
    """Time block-wise reads taken directly from a cyclic multitables queue.

    :param filename: File handed to ``multitables.Streamer``.
    :param node_path: Dataset path handed to ``get_queue``.
    :param n_procs: Forwarded to ``get_queue`` as ``n_procs``.
    :param read_iters: Upper bound of ``range(0, read_iters, q.block_size)``;
        one block is fetched for every step of that range.
    :param kwargs: Extra keyword arguments for ``multitables.Streamer``.
    :return: Elapsed time in seconds, as a difference of ``time.time()`` calls.
    """
    stream = multitables.Streamer(filename, **kwargs)
    q = stream.get_queue(node_path, n_procs=n_procs, cyclic=True)

    try:
        start = time.time()
        for _ in tqdm.tqdm(range(0, read_iters, q.block_size)):
            # Enter and leave the guard without touching the block: only the
            # cost of obtaining it is being measured.
            with q.get():
                pass
        end = time.time()
    finally:
        # Close the queue even if a read raised, and keep it out of the timing.
        q.close()

    return end - start
Exemple #5
0
    def __init__(self, filename, batch_size, **kw_args):
        """
        Set up a HDF5 file reader that hands out batches of ``batch_size``.
        The batch size counts elements along the outer-most dimension of the
        datasets being read. It can be thought of as the number of rows that
        are read at once and returned to the user.

        :param filename: The HDF5 file to read.
        :param batch_size: The size of the batches to be read.
        :param kw_args: Optional arguments to pass to multitables.
        """
        # The streamer is created first, exactly as before, so a failure to
        # construct it leaves no other attribute set on the instance.
        self.streamer = mtb.Streamer(filename, **kw_args)
        self.batch_size = batch_size
        # Start with empty bookkeeping lists and no ordering lock.
        self.vars, self.queues = [], []
        self.order_lock = None
    def test_cycle(self):
        """Read many full cycles from a cyclic generator and compare them.

        Rows are drawn from ``get_generator(cyclic=True, block_size=45)``
        until ``num_cycles * len(self.test_array)`` rows have been collected.
        The collected rows are then checked against ``num_cycles`` copies of
        ``self.test_array`` via ``assert_items_equal``, with a key that picks
        element ``[0, 0]`` of each row.
        """
        block_size = 45
        # ``lcm`` is defined elsewhere in the file (presumably the least common
        # multiple, which would make this the number of passes after which a
        # block boundary coincides with the end of the array again).
        num_cycles = lcm(block_size, len(self.test_array)) // len(
            self.test_array)
        # Scale up to the smallest multiple of num_cycles that exceeds 100.
        # Integer division gives the same result as the former
        # ``int(100 / num_cycles)``, and the former ``max(...)`` was redundant
        # because the scaled value is never smaller than num_cycles.
        num_cycles *= 100 // num_cycles + 1
        # Loop-invariant row limit, computed once.
        total_rows = num_cycles * len(self.test_array)
        reader = multitables.Streamer(filename=self.test_filename)

        ary = reader.get_generator(path=self.test_array_path,
                                   cyclic=True,
                                   block_size=block_size)
        try:
            result = []
            for i, row in enumerate(ary):
                # A cyclic generator never ends on its own, so stop explicitly.
                if i >= total_rows:
                    break
                result.append(row)

            assert_items_equal(self,
                               result,
                               list(self.test_array) * num_cycles,
                               key=lambda x: x[0, 0])
        finally:
            # Close the generator even when the assertion fails, so the stream
            # is not left open by a failing test.
            ary.close()

        # NOTE(review): the queue-based variant below is unreachable because of
        # this early return. It is kept unchanged so it can be re-enabled.
        return
        queue = reader.get_queue(path=self.test_array_path,
                                 cyclic=True,
                                 block_size=block_size)

        result = []
        for i in range(num_cycles * len(self.test_array) // block_size):
            guard = queue.get()
            self.assertIsNot(guard, multitables.QueueClosed)
            with guard as batch:
                result.append(batch.copy())

        assert_items_equal(self,
                           np.concatenate(result, axis=0),
                           list(self.test_array) * num_cycles,
                           key=lambda x: x[0, 0])
        queue.close()
    def test_threaded(self):
        """Drain one queue from 100 threads at once (currently disabled).

        Each worker thread repeatedly takes a guard from the shared queue,
        copies the batch and appends the copy to ``result`` under a lock. The
        main thread adds the remainder batch, joins the workers and compares
        ``result`` with ``get_batches(self.test_array, block_size)``. The test
        is reported as skipped instead of silently returning, so the runner
        does not count it as a pass.
        """
        # Previously a bare ``return``; skipping keeps the disabled state
        # visible in the test report. Everything below stays unreachable until
        # this line is removed.
        self.skipTest("test_threaded is currently disabled")
        block_size = len(self.test_array) // 100
        reader = multitables.Streamer(filename=self.test_filename)

        queue = reader.get_queue(path=self.test_array_path,
                                 n_procs=4,
                                 block_size=block_size)

        lock = threading.Lock()
        result = []

        def read():
            # Worker loop: runs until the queue hands back the closed sentinel.
            while True:
                guard = queue.get()
                if guard is multitables.QueueClosed:
                    break
                # Copy inside the guard and append outside it, so the guard is
                # held no longer than needed.
                with guard as batch:
                    batch_copy = batch.copy()
                with lock:
                    result.append(batch_copy)

        threads = [threading.Thread(target=read) for _ in range(100)]

        for t in threads:
            t.start()

        last_batch = reader.get_remainder(path=self.test_array_path,
                                          block_size=queue.block_size)
        # When the array length is exactly 100 blocks there must be no
        # remainder; otherwise the remainder is one more expected batch.
        if 100 * block_size == len(self.test_array):
            self.assertEqual(len(last_batch), 0)
        else:
            result.append(last_batch)

        for t in threads:
            t.join()

        assert_items_equal(self,
                           result,
                           get_batches(self.test_array, block_size),
                           key=lambda x: x[0, 0, 0])

        queue.close()
    def test_generator(self):
        """Check the default generator for both the array and the table node.

        For each node, every row produced by ``get_generator`` is collected
        and compared with the corresponding reference data through
        ``assert_items_equal``, using a per-node key function.
        """
        reader = multitables.Streamer(filename=self.test_filename)

        # (node path, reference data, key passed to assert_items_equal)
        cases = (
            (self.test_array_path, self.test_array,
             lambda x: x[0, 0]),
            (self.test_table_path, self.test_table_ary,
             lambda x: x['col_B'][0][0]),
        )

        for path, expected, key in cases:
            gen = reader.get_generator(path=path)
            try:
                assert_items_equal(self,
                                   list(gen),
                                   list(expected),
                                   key=key)
            finally:
                # Close the generator even when the assertion fails.
                gen.close()
    def test_ordered(self):
        """Check ``ordered=True`` generators for the array and the table node.

        For each node, every row produced by ``get_generator(ordered=True)``
        is collected and compared with the corresponding reference data
        through ``assert_items_equal`` with ``key=None``.
        """
        reader = multitables.Streamer(filename=self.test_filename)

        # (node path, reference data)
        cases = (
            (self.test_array_path, self.test_array),
            (self.test_table_path, self.test_table_ary),
        )

        for path, expected in cases:
            gen = reader.get_generator(path=path, ordered=True)
            try:
                assert_items_equal(self,
                                   list(gen),
                                   list(expected),
                                   key=None)
            finally:
                # Close the generator even when the assertion fails.
                gen.close()
    def test_direct(self):
        """Read the array through the queue interface in two different ways.

        The first pass requests ``block_size=None`` and polls ``queue.get()``
        until it returns ``multitables.QueueClosed``. The second pass requests
        ``block_size=16`` and uses ``queue.iter()``. In both passes the
        remainder from ``get_remainder`` is appended and the collected batches
        are compared with ``get_batches(self.test_array, queue.block_size)``.
        """
        reader = multitables.Streamer(filename=self.test_filename)

        # Pass 1: no explicit block size, manual polling of the queue.
        block_size = None
        queue = reader.get_queue(path=self.test_array_path,
                                 block_size=block_size)
        try:
            result = []
            while True:
                guard = queue.get()
                if guard is multitables.QueueClosed:
                    break
                with guard as batch:
                    result.append(batch.copy())
            # The block size actually in use is read back from the queue.
            result.append(
                reader.get_remainder(path=self.test_array_path,
                                     block_size=queue.block_size))
            assert_items_equal(self,
                               result,
                               get_batches(self.test_array, queue.block_size),
                               key=lambda x: x[0, 0, 0])
        finally:
            # Close the queue even when the assertion fails.
            queue.close()

        # Pass 2: explicit block size, iteration through queue.iter().
        block_size = 16
        queue = reader.get_queue(path=self.test_array_path,
                                 block_size=block_size)
        try:
            result = []
            for guard in queue.iter():
                with guard as batch:
                    result.append(batch.copy())
            result.append(
                reader.get_remainder(path=self.test_array_path,
                                     block_size=queue.block_size))
            assert_items_equal(self,
                               result,
                               get_batches(self.test_array, queue.block_size),
                               key=lambda x: x[0, 0, 0])
        finally:
            # Close the queue even when the assertion fails.
            queue.close()
    def test_howto(self):
        """Walk through the how-to style usage snippets (currently disabled).

        The body strings together several usage examples: a fully
        parameterised ``get_queue`` call drained with ``get()`` and with
        ``iter()``, a ``get_remainder`` call, a cyclic queue read once, and a
        ``get_generator`` loop. The test is reported as skipped instead of
        silently returning, so the runner does not count it as a pass.
        """
        # Previously a bare ``return``; skipping keeps the disabled state
        # visible in the test report. Everything below stays unreachable until
        # this line is removed.
        self.skipTest("test_howto is currently disabled")
        kw_args = {}
        stream = multitables.Streamer(filename=self.test_filename, **kw_args)

        # Identity stand-ins for the user's own processing functions.
        def do_something(x):
            return x

        def do_something_else(x):
            return x

        queue = stream.get_queue(
            # Path to dataset within the H5file.
            path=self.test_array_path,
            # Number of processes to launch for parallel reads. Defaults to 4.
            # NOTE(review): the second get_queue call below says
            # "Defaults to 2" for the same argument -- confirm which is right.
            n_procs=4,
            # Size of internal buffer in no. of blocks. Defaults to 2*n_proc+1.
            read_ahead=5,
            # A cyclic reader wraps at the end of the dataset.
            # Defaults to False.
            cyclic=False,
            # Size (along the outer dimension) of the blocks that will be read.
            # Defaults to a multiple of the dataset chunk size, or a 128KB
            # block. Should be left to the default or carefully chosen for
            # chunked arrays, else performance degradation can occur.
            block_size=32,
            # Force the stream to return blocks in on-disk order. Useful if two
            # datasets need to be read synchronously. This option may have a
            # performance penalty.
            ordered=False,
        )

        while True:
            # Get the guard object, will block until data is ready.
            guard = queue.get()
            if guard is multitables.QueueClosed:
                break  # Terminate the loop once the dataset is finished.
            # The guard returns the next block of data in the buffer.
            with guard as block:
                do_something(block)  # Perform actions on the data

        # NOTE(review): this loop is an exact repeat of the one above and runs
        # on a queue that has already reported QueueClosed -- confirm that
        # get() keeps returning QueueClosed before re-enabling this test.
        while True:
            # Get the guard object, will block until data is ready.
            guard = queue.get()
            if guard is multitables.QueueClosed:
                break  # Terminate the loop once the dataset is finished.
            # The guard returns the next block of data in the buffer.
            with guard as block:
                do_something(block)  # Perform actions on the data

        for guard in queue.iter():
            with guard as block:
                do_something(block)

        # The value is not used further; the call itself is the example.
        last_block = stream.get_remainder(self.test_array_path,
                                          queue.block_size)

        queue.close()

        queue = stream.get_queue(
            # Path to dataset within the H5file.
            path=self.test_array_path,
            # Number of processes to launch for parallel reads. Defaults to 2.
            n_procs=2,
            # Size of internal buffer in no. of blocks. Defaults to 2*n_proc+1.
            read_ahead=5,
            # A cyclic reader wraps at the end of the dataset.
            # Defaults to False.
            cyclic=True,
        )

        # A cyclic queue never finishes, so the example reads a single block
        # and leaves the loop.
        while True:
            with queue.get() as block:
                do_something(block)
                break

        gen = stream.get_generator(self.test_array_path,
                                   n_procs=4,
                                   read_ahead=9,
                                   cyclic=False,
                                   block_size=32)

        for row in gen:
            do_something_else(row)

        gen.close()
 def test_quickstart(self):
     """Minimal generator usage example (currently disabled).

     The body iterates every row of the array node through
     ``get_generator`` and passes it to an identity function. The test is
     reported as skipped instead of silently returning, so the runner does
     not count it as a pass.
     """
     # Previously a bare ``return``; skipping keeps the disabled state
     # visible in the test report. Everything below stays unreachable until
     # this line is removed.
     self.skipTest("test_quickstart is currently disabled")

     # Identity stand-in for the user's own processing function.
     def do_something(x):
         return x

     stream = multitables.Streamer(filename=self.test_filename)
     for row in stream.get_generator(path=self.test_array_path):
         do_something(row)