Example #1
def time_resize():
    with h5py.File('foo.h5', 'w') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version('0') as sv:
            sv.create_dataset('bar', (2, 15220, 2),
                              chunks=(300, 100, 2),
                              dtype=dt, data=np.full((2, 15220, 2), 0, dtype=dt))

    with h5py.File('foo.h5', 'r+') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version('1') as sv:
            bar = sv['bar']
            bar.resize((3, 15222, 2))
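These excerpts come from benchmark code and omit their module-level setup. A minimal preamble that would make them runnable might look like the following; the dtype `dt` is an assumption here, since the original benchmarks define it elsewhere.

# Hypothetical preamble for these excerpts; `dt` is an assumption,
# not shown in the originals.
import h5py
import numpy as np
from versioned_hdf5 import VersionedHDF5File

dt = np.dtype('float64')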
Example #2
def time_resize_and_write():
    with h5py.File('foo.h5', 'w') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version('0') as sv:
            sv.create_dataset('bar', (1, 10, 2),
                              chunks=(600, 2, 4),
                              dtype=dt, data=np.full((1, 10, 2), 0, dtype=dt))

    for i in range(1, 100):
        with h5py.File('foo.h5', 'r+') as f:
            vf = VersionedHDF5File(f)
            with vf.stage_version(str(i)) as sv:
                bar = sv['bar']
                bar.resize((1, (i+1) * 10, 2))
                bar[:, -10:, :] = np.full((1, 10, 2), i, dtype=dt)
Example #3
    def setup(self):
        if hasattr(self, 'file'):
            self.file.close()
        if os.path.exists('bench.hdf5'):
            os.remove('bench.hdf5')

        with h5py.File('bench.hdf5', 'w') as f:
            versioned_file = VersionedHDF5File(f)

            with versioned_file.stage_version('version1') as g:
                g.create_dataset('data',
                                 data=np.arange(10000).reshape((100, 10, 10)),
                                 chunks=(3, 3, 3))

        self.file = h5py.File('bench.hdf5', 'a')
        self.versioned_file = VersionedHDF5File(self.file)
Example #4
    def setup(self):
        with h5py.File('foo.h5', 'w') as f:
            vf = VersionedHDF5File(f)
            with vf.stage_version('0') as sv:
                sv.create_dataset('bar', data=np.random.rand(10))

            for i in range(1, 100):
                with vf.stage_version(str(i)) as sv:
                    sv['bar'][:] = np.random.rand(10)
            self.dt = np.datetime64(vf[str(50)].attrs['timestamp'])
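The timestamp stored in `self.dt` can later be used to fetch a version by datetime, as the `time_version_by_datetime` benchmark in Example #11 does. A minimal sketch of that lookup against the file produced by this setup:

# Sketch: look up the version matching a stored timestamp.
with h5py.File('foo.h5', 'r') as f:
    vf = VersionedHDF5File(f)
    ts = np.datetime64(vf['50'].attrs['timestamp'])
    data = vf[ts]['bar'][:]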
Example #5
def time_many_chunks():
    d0 = 2
    d1 = 15220
    d2 = 2
    shape = (d0, d1, d2)
    chunks = (600, 2, 4)
    with h5py.File('foo.h5', 'w') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version('0') as sv:
            sv.create_dataset('bar',
                              shape=shape,
                              maxshape=(None, None, None),
                              chunks=chunks,
                              dtype=dt,
                              data=np.full(shape, 0, dtype=dt))

    i = 1
    with h5py.File('foo.h5', 'r+') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version(str(i)) as sv:
            sv['bar'][:] = np.full(shape, i, dtype=dt)
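Reading the committed version back is a quick sanity check that the full overwrite took effect; a sketch, assuming `dt` is a numeric dtype as in the preamble above:

# Sketch: verify that version '1' contains the value written above.
with h5py.File('foo.h5', 'r') as f:
    vf = VersionedHDF5File(f)
    assert (vf['1']['bar'][:] == 1).all()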
Example #6
    def time_resize_smaller(self):
        with h5py.File('bench.hdf5', 'w') as f:
            versioned_file = VersionedHDF5File(f)
            with versioned_file.stage_version('version1') as g:
                dataset = g.create_dataset('data',
                                           data=np.arange(10000).reshape((100, 10, 10)),
                                           chunks=(3, 3, 3))
                assert isinstance(dataset, InMemoryArrayDataset) or (
                    isinstance(dataset, DatasetWrapper)
                    and isinstance(dataset.dataset, InMemoryArrayDataset))
                dataset.resize((10, 10, 10))
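The wrapper classes referenced in the assert are not imported in this excerpt; in versioned-hdf5 they are expected to come from the wrappers module. A likely import line (an assumption, not shown in the original):

# Assumed import for the classes used in the assert above.
from versioned_hdf5.wrappers import DatasetWrapper, InMemoryArrayDataset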
Example #7
def time_many_chunks_integer_index():
    d0 = 2
    d1 = 15220
    d2 = 2
    shape = (d0, d1, d2)
    chunks = (600, 2, 4)
    with h5py.File('foo.h5', 'w') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version('0') as sv:
            sv.create_dataset('bar',
                              shape=shape,
                              maxshape=(None, None, None),
                              chunks=chunks,
                              dtype=dt,
                              data=np.full(shape, 0, dtype=dt))

    i = 1
    with h5py.File('foo.h5', 'r+') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version(str(i)) as sv:
            i2 = np.random.choice(d1, 30, replace=False)
            i2 = np.sort(i2)
            sv['bar'][:, i2, :] = np.full((d0, len(i2), d2), i, dtype=dt)
Example #8
    def setup(self, n):
        if not os.path.exists(filename):
            with h5py.File(filename, 'w') as f:
                vf = VersionedHDF5File(f)
                with vf.stage_version('init') as sv:
                    sv.create_dataset('values', shape=(0, 0), dtype='float', fillvalue=numpy.nan,
                                      chunks=(22, 100), maxshape=(None, None), compression='lzf')

            # generate some test data with around 1000 versions
            v = 1
            with h5py.File(filename, 'r+') as f:
                vf = VersionedHDF5File(f)
                for d in range(22):
                    with vf.stage_version(str(v)) as sv:
                        values_ds = sv['values']
                        values_ds.resize((values_ds.shape[0] + 1, values_ds.shape[1] + 5000))
                        values_ds[-1, -5000] = numpy.random.rand()
                        v += 1
                    for c in range(n):
                        with vf.stage_version(str(v)) as sv:
                            values_ds = sv['values']
                            idxs = numpy.random.choice(values_ds.shape[1], 50, replace=False)
                            values_ds[-1, idxs] = numpy.random.rand(50)
                            v += 1
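After this setup completes, the file holds the 'init' version plus roughly 22 * (n + 1) data versions. A short sketch for inspecting what was written, reusing the module-level `filename` this benchmark assumes:

# Sketch: list the versions created by setup(); '__first_version__'
# is internal bookkeeping and is skipped, as in Example #11.
with h5py.File(filename, 'r') as f:
    vf = VersionedHDF5File(f)
    names = [v for v in vf._versions if v != '__first_version__']
    print(len(names), 'versions')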
Example #9
def time_many_chunks_arange():
    d0 = 2
    d1 = 15220
    d2 = 2
    shape = (d0, d1, d2)
    chunks = (600, 2, 4)
    with h5py.File('foo.h5', 'w') as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version('0') as sv:
            sv.create_dataset('bar',
                              shape=shape,
                              maxshape=(None, None, None),
                              chunks=chunks,
                              dtype=dt,
                              data=np.arange(np.prod(shape),
                                             dtype=dt).reshape(shape))
Example #10
    def time_delete(self, n):
        tmp_name = tempfile.mktemp('.h5')
        shutil.copy2(filename, tmp_name)
        try:
            # want to keep only every 10th version
            versions_to_delete = []
            with h5py.File(tmp_name, 'r') as f:
                vf = VersionedHDF5File(f)
                versions = sorted([(v, vf._versions[v].attrs['timestamp']) for v in vf._versions], key=lambda t: t[1])
                for i, v in enumerate(versions):
                    if i % 10 != 0:
                        versions_to_delete.append(v[0])

            with h5py.File(tmp_name, 'r+') as f:
                delete_versions(f, versions_to_delete)
        finally:
            os.remove(tmp_name)
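`delete_versions` is not imported in this excerpt; it is provided by versioned-hdf5's replay module. The module-level imports assumed by this benchmark would be along these lines:

# Assumed imports for the delete benchmark above.
import os
import shutil
import tempfile

import h5py
from versioned_hdf5 import VersionedHDF5File
from versioned_hdf5.replay import delete_versions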
Example #11
    def time_version_by_datetime(self):
        # Based on https://github.com/deshaw/versioned-hdf5/issues/170
        with h5py.File('foo.h5', 'r') as f:
            vf = VersionedHDF5File(f)
            for _ in range(100):
                _ = vf[self.dt]['bar'][:]

    def create_files(self, versions=True):
        tests = []
        msg = ""
        for c in self.compression:
            for p in self.exponents:
                for n in self.num_transactions:
                    chunk_size = 2**p
                    if versions:
                        name = f"{self.testname}_{n}_{p}_{c}"
                    else:
                        name = f"{self.testname}_{n}_{p}_{c}_no_versions"
                    filename = os.path.join(self.path, f"{name}.h5")
                    msg += f"File with {n} transactions, chunk size 2**{p} " \
                           f"and compression filter {c}"
                    try:
                        h5pyfile = h5py.File(filename, 'r')
                        msg += " exists - unable to compute creation time.\n"
                        t = 0
                    except Exception:
                        msg += " not available. Creating new file.\n"
                        # t0 = time.time()
                        t = self.testfun(n,
                                         name,
                                         chunk_size,
                                         c,
                                         versions=versions,
                                         deterministic=True)
                        # t = time.time()-t0
                        h5pyfile = h5py.File(filename, 'r')
                    if versions:
                        data = VersionedHDF5File(h5pyfile)
                        tests.append(
                            dict(num_transactions=n,
                                 chunk_size=chunk_size,
                                 compression=c,
                                 filename=filename,
                                 h5pyfile=h5pyfile,
                                 data=data,
                                 t_write=t))
                    else:
                        tests.append(
                            dict(num_transactions=n,
                                 chunk_size=chunk_size,
                                 compression=c,
                                 filename=filename,
                                 h5pyfile=h5pyfile,
                                 t_write=t))

        for test in tests:
            test['size'] = os.path.getsize(test['filename'])
            test['size_label'] = format_size(test['size'])

        if versions:
            nt = len(self.num_transactions)
            for test in tests[-nt:]:
                lengths = []
                total_size = 0
                for vname in test['data']._versions:
                    if vname != '__first_version__':
                        version = test['data'][vname]
                        group_key = list(version.keys())[0]
                        lengths.append(len(version[group_key]['val']))
                        total_size += len(version[group_key]['val'])
                test['theoretical_sizes'] = 24 * total_size

        # Removing some irrelevant info from the dictionary
        summary = []
        for test in tests:
            summary.append(
                dict((k, test[k]) for k in [
                    'num_transactions', 'filename', 'size', 'size_label',
                    't_write', 'chunk_size', 'compression'
                ]))
            test['h5pyfile'].close()

        self.tests = tests
        return summary, msg