Example #1
 def make_vds(self, f):
     # virtual dataset
     layout = h5.VirtualLayout((2, 10), 'f4')
     vsource1 = h5.VirtualSource(self.f1, 'data', shape=(10, ))
     vsource2 = h5.VirtualSource(self.f2, 'data', shape=(10, ))
     layout[0] = vsource1
     layout[1] = vsource2
     f.create_virtual_dataset('virtual', layout)
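A minimal read-back sketch (assuming make_vds was called on a writable file, say combined.h5, and that self.f1 and self.f2 each hold a 10-element 'data' dataset; the file name is hypothetical):

import h5py as h5

with h5.File('combined.h5', 'r') as f:
    vds = f['virtual']
    print(vds.shape)  # (2, 10)
    print(vds[0])     # row 0 reads through to f1's 'data'
    print(vds[1])     # row 1 reads through to f2's 'data'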
Example #2
def combine_h5(h5dir, out_h5file):
    filelist = list(pathlib.Path(h5dir).glob('*.h5'))
    unicode = h5py.special_dtype(vlen=str)
    n_files = len(filelist)

    # Get total no. of utts (spks) and no. of frames in .h5 files in the folder
    n_utts = list()
    n_frames = list()
    for i in range(n_files):
        with h5py.File(filelist[i], 'r') as f:
            n_utts.append(len(f['utt_ids']))
            n_frames.append(f['mfcc'].shape[0])
            mfcc_dim = f['mfcc'].shape[1]
    tot_n_utts = np.sum(n_utts)
    tot_n_frames = np.sum(n_frames)
    print(f"Total no. of utts = {tot_n_utts}")
    print(f"Total no. of frames = {tot_n_frames}")
    print(f"MFCC dim = {mfcc_dim}")

    # Assemble virtual dataset
    utt_layout = h5py.VirtualLayout(shape=(tot_n_utts, ), dtype=unicode)
    spk_layout = h5py.VirtualLayout(shape=(tot_n_utts, ), dtype=unicode)
    pos_layout = h5py.VirtualLayout(shape=(tot_n_utts, 2), dtype="int64")
    mfc_layout = h5py.VirtualLayout(shape=(tot_n_frames, mfcc_dim),
                                    dtype="float32")
    k1 = 0
    k2 = 0
    for i in range(n_files):
        print(f"Reading {filelist[i]}")
        range1 = range(k1, k1 + n_utts[i])
        range2 = range(k2, k2 + n_frames[i])
        print(f"spk_ids: {range1}")
        print(f"mfcc: {range2}")
        utt_layout[range1] = h5py.VirtualSource(filelist[i],
                                                "utt_ids",
                                                shape=(n_utts[i], ))
        spk_layout[range1] = h5py.VirtualSource(filelist[i],
                                                "spk_ids",
                                                shape=(n_utts[i], ))
        pos_layout[range1] = h5py.VirtualSource(filelist[i],
                                                "positions",
                                                shape=(n_utts[i], 2))
        mfc_layout[range2] = h5py.VirtualSource(filelist[i],
                                                "mfcc",
                                                shape=(n_frames[i], mfcc_dim))
        k1 = k1 + n_utts[i]
        k2 = k2 + n_frames[i]

    # Add virtual dataset to output file
    with h5py.File(out_h5file, "w", libver="latest") as f:
        print(f"Writing combined file {out_h5file}")
        f.create_virtual_dataset("utt_ids", utt_layout, fillvalue=None)
        f.create_virtual_dataset("spk_ids", spk_layout, fillvalue=None)
        f.create_virtual_dataset("positions", pos_layout, fillvalue=None)
        f.create_virtual_dataset("mfcc", mfc_layout, fillvalue=None)
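A usage sketch (directory and file names are hypothetical; each input .h5 file is assumed to hold the utt_ids, spk_ids, positions, and mfcc datasets read above):

combine_h5('mfcc_parts/', 'mfcc_all.h5')
with h5py.File('mfcc_all.h5', 'r') as f:
    print(f['mfcc'].shape)     # (tot_n_frames, mfcc_dim)
    print(f['utt_ids'].shape)  # (tot_n_utts,)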
Example #3
    def createResource(cls, directory):
        filename = os.path.join(directory, "base.h5")
        extH5FileName = os.path.join(directory, "base__external.h5")
        extDatFileName = os.path.join(directory, "base__external.dat")

        externalh5 = h5py.File(extH5FileName, mode="w")
        externalh5["target/dataset"] = 50
        externalh5["target/link"] = h5py.SoftLink("/target/dataset")
        externalh5["/ext/vds0"] = [0, 1]
        externalh5["/ext/vds1"] = [2, 3]
        externalh5.close()

        numpy.array([0, 1, 10, 10, 2, 3]).tofile(extDatFileName)

        h5 = h5py.File(filename, mode="w")
        h5["group/dataset"] = 50
        h5["link/soft_link"] = h5py.SoftLink("/group/dataset")
        h5["link/soft_link_to_group"] = h5py.SoftLink("/group")
        h5["link/soft_link_to_link"] = h5py.SoftLink("/link/soft_link")
        h5["link/soft_link_to_file"] = h5py.SoftLink("/")
        h5["group/soft_link_relative"] = h5py.SoftLink("dataset")
        h5["link/external_link"] = h5py.ExternalLink(extH5FileName,
                                                     "/target/dataset")
        h5["link/external_link_to_link"] = h5py.ExternalLink(
            extH5FileName, "/target/link")
        h5["broken_link/external_broken_file"] = h5py.ExternalLink(
            extH5FileName + "_not_exists", "/target/link")
        h5["broken_link/external_broken_link"] = h5py.ExternalLink(
            extH5FileName, "/target/not_exists")
        h5["broken_link/soft_broken_link"] = h5py.SoftLink("/group/not_exists")
        h5["broken_link/soft_link_to_broken_link"] = h5py.SoftLink(
            "/group/not_exists")
        layout = h5py.VirtualLayout((2, 2), dtype=int)
        layout[0] = h5py.VirtualSource("base__external.h5",
                                       name="/ext/vds0",
                                       shape=(2, ),
                                       dtype=int)
        layout[1] = h5py.VirtualSource("base__external.h5",
                                       name="/ext/vds1",
                                       shape=(2, ),
                                       dtype=int)
        h5.create_group("/ext")
        h5["/ext"].create_virtual_dataset("virtual", layout)
        external = [("base__external.dat", 0, 2 * 8),
                    ("base__external.dat", 4 * 8, 2 * 8)]
        h5["/ext"].create_dataset("raw",
                                  shape=(2, 2),
                                  dtype=int,
                                  external=external)
        h5.close()

        return filename
Example #4
    def test_extra_args(self):
        with h5.File(name='f1', driver='core', backing_store=False) as ftest:
            ftest['a'] = [1, 2, 3]
            a = ftest['a']

            with self.assertRaises(TypeError):
                h5.VirtualSource(a, 'b')
            with self.assertRaises(TypeError):
                h5.VirtualSource(a, shape=(1, ))
            with self.assertRaises(TypeError):
                h5.VirtualSource(a, maxshape=(None, ))
            with self.assertRaises(TypeError):
                h5.VirtualSource(a, dtype=int)
Example #5
def virtual_sources(files: List[Path], meta_file: h5py.File) -> VirtualSourceInfo:
    """
    Create HDF5 virtual sources and collate ancillary information from raw data files.

    Args:
        files:      Lexicographically sorted list of raw file paths.
        meta_file:  Tristan detector metadata file object.

    Returns:
        - Dictionary of event data set names and iterators of corresponding HDF5 virtual
          sources.  The iterator of sources for each data set is based on
          itertools.cycle and so repeats indefinitely in the order in which successive
          event slices should be selected to build the virtual data set.
        - Dictionary of cue data set names and lists of corresponding HDF5 virtual
          sources.  The lists of sources have length and order as per the list of input
          files.
        - List of the number of cues in each data file after zero-padding has been
          stripped.  Length and order as per the list of input files.
        - Dictionary of data set names and corresponding data types.
    """
    event_sources = {key: [] for key in event_keys}
    cue_sources = {key: [] for key in cue_keys}
    num_cues_per_file = []

    with ExitStack() as stack:
        raw_files = [stack.enter_context(h5py.File(path)) for path in files]

        dtypes = {key: raw_files[0][key].dtype for key in event_keys + cue_keys}

        for raw_file in raw_files:
            # The cues are padded with zeroes.  Find the first so we can slice them off.
            num_cues_per_file.append(np.argmax(raw_file["cue_id"][()] == 0))
            for key in event_keys:
                event_sources[key].append(h5py.VirtualSource(raw_file[key]))
            for key in cue_keys:
                cue_sources[key].append(h5py.VirtualSource(raw_file[key]))

    # Make a list of slices with which to divide the lexicographically sorted list of
    # file paths into sub-lists, each slice corresponding to a different detector
    # module.  Ordered by module number.  Length is equal to the number of modules in
    # the detector.
    file_slices = np.pad(np.cumsum(meta_file["fp_per_module"]), (1, 0))
    file_slices = list(map(slice, file_slices[:-1], file_slices[1:]))

    # Construct a carousel to select time slices in the order in which they should
    # appear in the virtual layout.
    for key, sources in event_sources.items():
        carousel = zip(*(cycle(sources[file_slice]) for file_slice in file_slices))
        event_sources[key] = chain.from_iterable(carousel)

    return event_sources, cue_sources, num_cues_per_file, dtypes
Example #6
    def __init__(self, filenames, default_streams=None):
        super(H5DatasetLoader, self).__init__()
        self.filenames = filenames
        if isinstance(self.filenames, list):
            self._h5_tempfile = tempfile.NamedTemporaryFile()
            #self.h5_file = h5py.File(self._h5_tempfile, 'w', libver='latest')

            self._allfiles, _allstreams, _lengths = zip(*[H5DatasetLoader.load_single_h5(f) for f in self.filenames])

            total_len = sum(_lengths)

            # Create virtual datasets; assumes every file has the same streams and per-stream shapes as the first file
            ll = (0,) + _lengths
            ll = np.cumsum(ll)
            for s in _allstreams[0]:
                shape = (total_len, ) + self._allfiles[0][s].shape[1:]
                layout = h5py.VirtualLayout(shape=shape, dtype=self._allfiles[0][s].dtype)

                for idx, f in enumerate(self._allfiles):
                    vsource = h5py.VirtualSource(f[s])
                    layout[ll[idx]:ll[idx+1]] = vsource

                with h5py.File(self._h5_tempfile.name, 'a', libver='latest') as f:
                    f.create_virtual_dataset(s, layout,)
            self._h5_tempfile.flush()
            self.h5_file = H5DatasetLoader.load_single_h5(self._h5_tempfile.name)[0]
        else:
            self.h5_file = H5DatasetLoader.load_single_h5(self.filenames)[0]
        self.streams_available = list(self.h5_file.keys())
        self.default_streams = default_streams

        if default_streams is not None:
            for s in default_streams:
                assert s in self.streams_available, f"{s} not found in available streams"
Example #7
 def make_linked_stack(self, fullname):
     """
     Actually makes the stacked dataset. This is a separate method since h5py's
     visititems does not follow external links.
     
     fullname
         string key to the dataset to be converted into a stacked VDS
     """
     datashape = h5.File(self.source_path_pattern %
                         (self.file_numbers[0]))[fullname].shape
     outshape = (len(self.file_numbers), ) + datashape
     TGT = h5.VirtualTarget(self.target_path, fullname, shape=outshape)
     k = 0
     VMlist = []
     for fnum in self.file_numbers:
         print(fnum)
         source_path = self.source_path_pattern % (fnum)
         VSRC = h5.VirtualSource(source_path, fullname, shape=datashape)
         VM = h5.VirtualMap(VSRC, TGT[k:(k + 1):1], dtype=np.float64)
         VMlist.append(VM)
         k += 1
     d = self.outfile.create_virtual_dataset(VMlist=VMlist, fillvalue=0)
     for key, val in h5.File(
             self.source_path_pattern %
             (self.file_numbers[0]))[fullname].attrs.items():
         self.outfile[fullname].attrs[key] = val
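This example targets the experimental VDS prototype that predates the h5py 2.9 API (VirtualTarget/VirtualMap are gone from current releases). A minimal sketch of the same stacking with the modern VirtualLayout/VirtualSource API, under the same assumptions about the source files:

layout = h5.VirtualLayout(shape=outshape, dtype='f8')
for k, fnum in enumerate(self.file_numbers):
    # one source dataset per file, stacked along the new leading axis
    layout[k] = h5.VirtualSource(self.source_path_pattern % (fnum),
                                 fullname, shape=datashape)
self.outfile.create_virtual_dataset(fullname, layout, fillvalue=0)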
Example #8
def writesino(h5name, omegas, dtys, filenames):
    offset, size, shape, dtype = binary_info( filenames[0][0] )
    print(offset,size,shape,dtype)
    nframes = len( omegas[0] ) * len( omegas )
    print(nframes, len(omegas), sum(len(o) for o in omegas))
    # Now create a hdf5 file:
    with h5py.File(h5name, "w", libver='latest' ) as h:
        # now create a VDS linking within the same file
        layout = h5py.VirtualLayout( shape = (nframes, shape[0], shape[1] ),
                                     dtype = dtype )
        j = 0
        graw = h.require_group('scans')
        for i, scan in enumerate(filenames):
            g = graw.require_group('scan%04d'%(i))
            g.create_dataset( "data",
                              shape = (len(scan), shape[0], shape[1]),
                              dtype = dtype,
                              external = [(fname, offset, size) for fname in scan] )
            g.create_dataset( "omega" , data = omegas[i] )
            g.create_dataset( "dty" , data = dtys[i] )
            vsource = h5py.VirtualSource( h.filename, # ok - circular?
                                          'scans/scan%04d/data'%(i),
                                          shape = (len(scan), shape[0], shape[1]) )
            layout[ j:j+len(scan), :, :] = vsource
            j += len(scan)
        g = h.require_group('sinogram')
        g.create_dataset('omega', data = np.concatenate(omegas) )
        g.create_dataset('dty', data = np.concatenate(dtys) )
        g.create_virtual_dataset( 'data', layout )
Example #9
    def _assemble_data(self, source, key):
        """Assemble chunks of data into a virtual layout"""
        # First, get a list of all non-empty data chunks
        chunks = [
            c for c in self.data._find_data_chunks(source, key)
            if (c.counts > 0).any()
        ]
        chunks.sort(key=lambda c: c.train_ids[0])
        if not chunks:
            return None, None

        # Create the layout, which will describe what data is where
        n_total = np.sum([c.counts.sum() for c in chunks])
        ds0 = chunks[0].dataset
        layout = h5py.VirtualLayout(shape=(n_total, ) + ds0.shape[1:],
                                    dtype=ds0.dtype)

        # Map each chunk into the relevant part of the layout
        output_cursor = np.uint64(0)
        for chunk in chunks:
            n = chunk.counts.sum()
            src = h5py.VirtualSource(chunk.dataset)
            src = src[chunk.slice]
            layout[output_cursor:output_cursor + n] = src
            output_cursor += n

        assert output_cursor == n_total

        # Make an array of which train ID each data entry is for:
        train_ids = np.concatenate(
            [np.repeat(c.train_ids, c.counts.astype(np.intp)) for c in chunks])
        return layout, train_ids
Example #10
    def test_mismatched_selections(self):
        layout = h5.VirtualLayout((4, 100), 'i4', maxshape=(4, None))

        filename = osp.join(self.tmpdir, "1.h5")
        vsource = h5.VirtualSource(filename, 'data', shape=(100, ))
        with self.assertRaisesRegex(ValueError, r'different number'):
            layout[0, :49] = vsource[0:100:2]
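For contrast, a sketch of a mapping the layout would accept: the strided selection 0:100:2 picks 50 elements, which matches a 50-wide slot on the layout side.

layout[0, :50] = vsource[0:100:2]  # 50 elements on both sides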
Example #11
    def test_eiger_high_level(self):
        self.outfile = self.working_dir + 'eiger.h5'
        TGT = h5.VirtualTarget(self.outfile, 'data', shape=(78, 200, 200))
        VMlist = []
        M_minus_1 = 0
        # Create the virtual dataset file
        with h5.File(self.outfile, 'w', libver='latest') as f:
            for foo in self.fname:
                in_data = h5.File(foo, 'r')['data']
                src_shape = in_data.shape
                in_data.file.close()
                M = M_minus_1 + src_shape[0]
                VSRC = h5.VirtualSource(foo, 'data', shape=src_shape)
                VM = h5.VirtualMap(VSRC,
                                   TGT[M_minus_1:M, :, :],
                                   dtype=np.float64)
                VMlist.append(VM)
                M_minus_1 = M
            d = f.create_virtual_dataset(VMlist=VMlist, fillvalue=45)

        f = h5.File(self.outfile, 'r')['data']
        self.assertEqual(f[10, 100, 10], 0.0)
        self.assertEqual(f[30, 100, 100], 1.0)
        self.assertEqual(f[50, 100, 100], 2.0)
        self.assertEqual(f[70, 100, 100], 3.0)
        f.file.close()
Example #12
    def test_percival_high_level(self):
        self.outfile = self.working_dir + 'percival.h5'
        VM = []
        # Create the virtual dataset file
        with h5.File(self.outfile, 'w', libver='latest') as f:
            TGT = h5.VirtualTarget(
                self.outfile,
                'data',
                shape=(79, 200, 200),
                maxshape=(None, 200, 200)
            )  # Virtual target is a representation of the output dataset
            k = 0
            for foo in self.fname:
                VSRC = h5.VirtualSource(foo,
                                        'data',
                                        shape=(20, 200, 200),
                                        maxshape=(None, 200, 200))
                VM.append(
                    h5.VirtualMap(VSRC, TGT[k:79:4, :, :], dtype=np.float64))
                k += 1
            f.create_virtual_dataset(
                VMlist=VM,
                fillvalue=-5)  # pass the fill value and list of maps

        f = h5.File(self.outfile, 'r')['data']
        sh = f.shape
        line = f[:8, 100, 100]
        foo = np.array(2 * list(range(4)))
        f.file.close()
        self.assertEqual(
            sh,
            (79, 200, 200),
        )
        np.testing.assert_array_equal(line, foo)
Example #13
    def test_percival_high_level(self):
        outfile = osp.join(self.working_dir, 'percival.h5')

        # Virtual layout is a representation of the output dataset
        layout = h5.VirtualLayout(shape=(79, 200, 200), dtype=np.float64)
        for k, filename in enumerate(self.fname):
            dim1 = 19 if k == 3 else 20
            vsource = h5.VirtualSource(filename,
                                       'data',
                                       shape=(dim1, 200, 200))
            layout[k:79:4, :, :] = vsource[:, :, :]

        # Create the virtual dataset file
        with h5.File(outfile, 'w', libver='latest') as f:
            f.create_virtual_dataset('data', layout, fillvalue=-5)

        foo = np.array(2 * list(range(4)))
        with h5.File(outfile, 'r') as f:
            ds = f['data']
            line = ds[:8, 100, 100]
            self.assertEqual(
                ds.shape,
                (79, 200, 200),
            )
            assert_array_equal(line, foo)
Example #14
def create_virtual_data(file_pattern, x, entry_key, save_to):
    files = [file_pattern % el for el in x]
    files = [(el, f) for f, el in zip(files, x) if os.path.exists(f)]
    # entry_key = '/ref/power/008'
    # save_to = "/Users/beauchamplab/rave_data/data_dir/congruency/YAB/rave/data/power/virtual.h5"

    if len(files) == 0:
        print('No valid files found')
        return False

    # get file shape
    with h5py.File(files[0][1], 'r') as sample_f:
        sh = sample_f[entry_key].shape
        dtype = sample_f[entry_key].dtype

    layout = h5py.VirtualLayout(shape=(len(files), ) + sh, dtype=dtype)
    for i, file_dup in enumerate(files):
        filename = file_dup[1]
        el = file_dup[0]
        print(filename)
        vsource = h5py.VirtualSource(filename, entry_key, shape=sh)
        layout[i, :, :] = vsource

    with h5py.File(save_to, 'w', libver='latest') as f:
        f.create_virtual_dataset(entry_key, layout, fillvalue=np.nan)

    return True
Example #15
    def _map_layouts(self, layouts):
        """
        Map virtual sources into virtual layouts.

        Parameters
        ----------
        layouts: dict
          A dictionary of unmapped virtual layouts.

        Returns
        -------
        layouts: dict
          A dictionary of virtual layouts mapped to the virtual sources.
        """
        for name, layout in layouts.items():
            key = '{}.{}'.format(self.group_label, name)
            have_data = np.zeros((self.nframes, self.nmodules), dtype=bool)

            for source, modno in self.detdata.source_to_modno.items():
                print(f" ### Source: {source}, ModNo: {modno}, Key: {key}")
                module_ix = self._get_module_index(modno)
                for chunk in self.data._find_data_chunks(source, key):
                    vsrc = h5py.VirtualSource(chunk.dataset)
                    self._map_chunk(chunk, vsrc, layout, module_ix, have_data)

            filled_pct = 100 * have_data.sum() / have_data.size
            if hasattr(layout, 'sources'):
                n_mappings = len(layout.sources)  # h5py < 3.3
            else:
                n_mappings = layout.dcpl.get_virtual_count()  # h5py >= 3.3
            log.info(f"Assembled {n_mappings:d} chunks for {key:s}, "
                     f"filling {filled_pct:.2f}% of the hyperslab")

        return layouts
Example #16
def merge(output, h5s):
    try:
        dfs = [h5py.File(h5, "r") for h5 in h5s]

        im_key = list(dfs[0].keys())[0] + "/images"
        im_shape = dfs[0][im_key].shape[1:]
        merged_shape = [0] + list(im_shape)
        for df in dfs:
            assert df[im_key].shape[
                1:] == im_shape, "Image shape in %s (%s) does not equal %s" % (
                    df.filename, str(df[im_key].shape[1:]), str(im_shape))
            merged_shape[0] += df[im_key].shape[0]

        merged_shape = tuple(merged_shape)

        with h5py.File(output, "w") as merged_df:
            for changrp in dfs[0].keys():
                mergedgrp = merged_df.create_group(changrp)
                for key in dfs[0][changrp].keys():
                    layout = h5py.VirtualLayout(
                        shape=merged_shape, dtype=dfs[0][changrp][key].dtype)
                    vsources = []
                    i = 0
                    for df in dfs:
                        vsources.append(
                            h5py.VirtualSource(df[changrp + "/" + key]))
                        layout[i:i + vsources[-1].shape[0]] = vsources[-1]

                        i += vsources[-1].shape[0]

                    mergedgrp.create_virtual_dataset(key, layout)

    finally:
        for df in dfs:
            df.close()
Example #17
def joinVDS(infilenames, outfilename):
    """
    Creates a new HDF5 file containing virtual datasets for every dataset in
    infilenames[0] with ndim > 1, concatenating those datasets across all
    files in infilenames.
    """
    layouts = {}

    def createlayout(name, obj):
        if isinstance(obj, h5py.Dataset) and len(obj.shape) > 1:
            layouts[name] = h5py.VirtualLayout(shape=(0, *obj.shape[1:]),
                                               maxshape=(None, *obj.shape[1:]),
                                               dtype=obj.dtype)

    with h5py.File(infilenames[0], "r") as firstfile:
        firstfile.visititems(
            createlayout)  # instead of enumerating the file to visit subgroups

    for filename in infilenames:
        with h5py.File(filename, "r") as currentfile:
            for key, layout in layouts.items():
                vsource = h5py.VirtualSource(currentfile[key])
                layout.shape = (layout.shape[0] + vsource.shape[0],
                                *layout.shape[1:])
                layout[-vsource.shape[0]:, ...] = vsource[:]
    with h5py.File(outfilename, "w", libver="latest") as outfile:
        for key, layout in layouts.items():
            outfile.create_virtual_dataset(key, layout, fillvalue=None)
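A usage sketch (file names hypothetical):

joinVDS(['run_000.h5', 'run_001.h5', 'run_002.h5'], 'joined.h5')

Growing each layout by reassigning layout.shape within its maxshape, as done above, avoids a separate first pass over the files just to total up the concatenated length.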
Example #18
def save_epix(out_file, descriptor, trains, shape, epix_id):
    """
    Save EPIX data to a VDS HDF5 file

    out_file - HDF5 file
    descriptor - list of data files to save
    trains - train IDs to save
    shape - EPIX data shape
    epix_id - EPIX detector number
    """
    layout = h5py.VirtualLayout(shape=(trains.size, ) + shape, dtype=np.uint16)
    counter = 0
    for file_name in descriptor:
        print('Opening file: {}'.format(os.path.basename(file_name)))
        with h5py.File(file_name, 'r') as data_file:
            file_trains = data_file[config.EPIX_TRAIN_KEY][:]
            file_data = data_file[config.EPIX_KEY.format(epix_id)]
            file_idxs = np.concatenate(
                [np.where(train_id == file_trains)[0] for train_id in trains])
            chunk_size = file_data.chunks[0]
            num_chunks = int(np.ceil(file_idxs.size / chunk_size))
            for chunk in range(num_chunks):
                start, end = chunk * chunk_size, min(file_data.shape[0],
                                                     (chunk + 1) * chunk_size)
                data = h5py.VirtualSource(file_data)[
                    file_idxs[start:end], :, :]
                layout[counter:counter + file_idxs[start:end].size] = data
                counter += file_idxs[start:end].size
        print('File {0} saved, data size: {1:d}\n'.format(
            os.path.basename(file_name), counter))
    out_file.create_virtual_dataset(config.EPIX_DATA_KEY.format(epix_id),
                                    layout)
Example #19
def h5_virtual_file(filenames, name="data"):
    """
    Assembles a virtual HDF5 file from multiple source files
    """
    vsources = []
    total_t = 0
    for path in filenames:
        # Read shape and dtype, then close the file; the VirtualSource
        # only needs the path, dataset name, and shape.
        with h5py.File(path, "r") as fin:
            t, *features_shape = fin[name].shape
            dtype = fin[name].dtype
        total_t += t
        vsources.append(h5py.VirtualSource(path, name, shape=(t, *features_shape)))

    # Assemble virtual dataset
    layout = h5py.VirtualLayout(shape=(total_t, *features_shape), dtype=dtype)
    cursor = 0
    for vsource in vsources:
        # generate slices like layout[0:10, :, :, :]
        indices = (slice(cursor, cursor + vsource.shape[0]),) + (slice(None),) * (
            len(vsource.shape) - 1
        )
        layout[indices] = vsource
        cursor += vsource.shape[0]
    # Add virtual dataset to output file
    f = h5py.File(f"{uuid.uuid4()}.h5", "w", libver="latest")
    f.create_virtual_dataset(name, layout)
    return f
Example #20
 def test_shape_calculation_positive_step(self):
     dataset = h5.VirtualSource('test', 'test', (20, ))
     cmp = []
     for i in range(5):
         d = dataset[2:12 + i:3].shape[0]
         ref = np.arange(20)[2:12 + i:3].size
         cmp.append(ref == d)
     self.assertEqual(5, sum(cmp))
Example #21
 def test_double_strided_range(self):
     dataset = h5.VirtualSource('test', 'test', (20, 30, 30))
     sliced = dataset[6:12:2, :, 20:26:3]
     self.assertEqual((
         3,
         30,
         2,
     ), sliced.shape)
Example #22
 def test_shape_calculation_positive_step_switched_start_stop(self):
     dataset = h5.VirtualSource('test', 'test', (20, ))
     cmp = []
     for i in range(5):
         d = dataset[12 + i:2:3].shape[0]
         ref = np.arange(20)[12 + i:2:3].size
         print(d, ref)
         cmp.append(ref == d)
     self.assertEqual(5, sum(cmp))
Example #23
def split(input_h5, output_h5):
    """Read the data file, create N_FAST * N_SLOW new data sets, then
    copy the data from the former into the latter and build a VDS"""

    with h5py.File(input_h5, "r") as fin:
        frames, slow, fast = fin["data"].shape

        output_files = []
        output_dsets = []
        for n in range(len(CHUNKMAP)):
            filename = output_h5.replace(".h5", "_%02d.h5" % n)
            fout = h5py.File(filename, "x")

            # in here I am chunking as 4-module chunks but _maybe_ we should
            # consider chunking as 1-module chunks and having 4 chunks per
            # "image" -> :thinking_face:

            dset = fout.create_dataset(
                "data",
                (frames, 4 * MOD_SLOW, MOD_FAST),
                chunks=(1, 4 * MOD_SLOW, MOD_FAST),
                compression=bitshuffle.h5.H5FILTER,
                compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4),
                dtype=fin["data"].dtype,
            )

            output_files.append((fout, filename))
            output_dsets.append(dset)

        blit(fin["data"], output_dsets)

        for fout in output_files:
            fout[0].close()

        # create VDS
        layout = h5py.VirtualLayout(shape=(frames, slow, fast), dtype="i4")

        for i, chunk in enumerate(CHUNKMAP):
            source = h5py.VirtualSource(output_files[i][1],
                                        "data",
                                        shape=(frames, 4 * MOD_SLOW, MOD_FAST))
            for k, n in enumerate(chunk):
                s, f = divmod(n, N_FAST)
                f0 = f * (MOD_FAST + GAP_FAST)
                f1 = f0 + MOD_FAST
                s0 = s * (MOD_SLOW + GAP_SLOW)
                s1 = s0 + MOD_SLOW
                layout[:, s0:s1,
                       f0:f1] = source[:, k * MOD_SLOW:(k + 1) * MOD_SLOW, :]

        fout = h5py.File(output_h5, "x")
        data = fout.create_virtual_dataset("data", layout, fillvalue=-1)
        for k in "image_nr_low", "image_nr_high":
            data.attrs.create(k, fin["data"].attrs.get(k), dtype="i4")
Example #24
    def preallocate_output(self, out, parallel_store=False):
        """
        Storage allocation and provisioning

        Parameters
        ----------
        out : syncopy data object
           Empty object for holding results
        parallel_store : bool
           If `True`, a directory for virtual source files is created
           in Syncopy's temporary on-disk storage (defined by `syncopy.__storage__`).
           Otherwise, a dataset of appropriate type and shape is allocated
           in a new regular HDF5 file created inside Syncopy's temporary
           storage folder.

        Returns
        -------
        Nothing : None

        See also
        --------
        compute : management routine controlling memory pre-allocation
        """

        # In case parallel writing via VDS storage is requested, prepare
        # directory for by-chunk HDF5 files and construct virtual HDF layout
        if parallel_store:
            vdsdir = os.path.splitext(os.path.basename(out.filename))[0]
            self.virtualDatasetDir = os.path.join(__storage__, vdsdir)
            os.mkdir(self.virtualDatasetDir)

            layout = h5py.VirtualLayout(shape=self.outputShape, dtype=self.dtype)
            for k, idx in enumerate(self.targetLayout):
                fname = os.path.join(self.virtualDatasetDir, "{0:d}.h5".format(k))
                # Catch empty selections: don't map empty sources into the layout of the VDS
                if all(self.sourceLayout[k]):
                    layout[idx] = h5py.VirtualSource(fname, self.virtualDatasetNames, shape=self.targetShapes[k])
            self.VirtualDatasetLayout = layout
            self.outFileName = os.path.join(self.virtualDatasetDir, "{0:d}.h5")
            self.tmpDsetName = self.virtualDatasetNames

        # Create regular HDF5 dataset for sequential writing
        else:

            # The shape of the target depends on trial-averaging
            if not self.keeptrials:
                shp = self.cfg["chunkShape"]
            else:
                shp = self.outputShape
            with h5py.File(out.filename, mode="w") as h5f:
                h5f.create_dataset(name=self.outDatasetName,
                                   dtype=self.dtype, shape=shp)
            self.outFileName = out.filename
            self.tmpDsetName = self.outDatasetName
Example #25
def concatenate(file_names_to_concatenate):
    entry_key = 'data'  # where the data is inside of the source files.
    sh = h5py.File(file_names_to_concatenate[0],
                   'r')[entry_key].shape  # get the first ones shape.
    layout = h5py.VirtualLayout(shape=(len(file_names_to_concatenate), ) + sh,
                                dtype=np.float64)
    with h5py.File("VDS.h5", 'w', libver='latest') as f:
        for i, filename in enumerate(file_names_to_concatenate):
            vsource = h5py.VirtualSource(filename, entry_key, shape=sh)
            layout[i, :, :, :] = vsource

        f.create_virtual_dataset(entry_key, layout, fillvalue=0)
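Reading the result back is then transparent (a sketch; file names are hypothetical and every source file is assumed to hold a 3-D 'data' dataset of identical shape):

concatenate(['scan_0.h5', 'scan_1.h5', 'scan_2.h5'])
with h5py.File('VDS.h5', 'r') as f:
    print(f['data'].shape)  # (3,) + sh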
Example #26
def tile_h5datasets(dest, name, sources, shape_map, tile_shape, nscandim=1):
    """Merge datasets in a virtual dataset.

    :param h5py.Group dest:
    :param str name:
    :param list(h5py.Dataset) sources:
    :param dict shape_map:
    :param int nscandim: start index of the data dimensions
    """
    dset_shapes = [dset.shape for dset in sources]
    scan_shapes = [dset_shape[:nscandim]
                   for dset_shape in dset_shapes]  # F-order
    det_shapes = [dset_shape[nscandim:] for dset_shape in dset_shapes]

    reshaped_scan_shapes = [
        shape_map.get(scan_shape, scan_shape) for scan_shape in scan_shapes
    ]  # F-order
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order

    reduced_scan_shapes, reshaped_scan_shapes = zip(
        *(match_shapes([shape1, shape2[::-1]])
          for shape1, shape2 in zip(scan_shapes, reshaped_scan_shapes)))
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order
    tile_shape = tile_shape[::-1]  # C-order

    layout_scan_shape, indices = tile_indices(tile_shape,
                                              reshaped_scan_shapes,
                                              order="C")

    layout_shape = layout_scan_shape + max_shape(det_shapes)

    dtype = sources[0].dtype
    fillvalue = sources[0].fillvalue
    layout = h5py.VirtualLayout(shape=layout_shape, dtype=dtype)
    for layout_idx, dset, reduced_scan_shape, det_shape in zip(
            indices, sources, reduced_scan_shapes, det_shapes):
        vsource = h5py.VirtualSource(
            dset.file.filename,
            dset.name,
            shape=dset.shape,
            dtype=dset.dtype,
        )
        reduced_source_shape = reduced_scan_shape + det_shape
        det_idx = tuple(slice(0, n) for n in det_shape)
        if reduced_source_shape != vsource.shape:
            vsource_idx = tuple(slice(0, n) for n in reduced_source_shape)
            vsource_idx += det_idx
            vsource = vsource[vsource_idx]
        layout_idx += det_idx
        layout[layout_idx] = vsource
    dest.create_virtual_dataset(name, layout, fillvalue=fillvalue)
Example #27
def test_check_file(tmp_path):
    filename = str(tmp_path / 'test.h5')

    noaccess = (tmp_path / 'noaccess.h5')
    noaccess.touch()
    noaccess.chmod(0)

    with h5py.File(filename, 'w') as f:
        f['exists'] = np.arange(10, dtype=np.float32)

        layout = h5py.VirtualLayout((10, 10), np.float32)

        # 0: valid, accessible mapping
        layout[0] = h5py.VirtualSource('test.h5', 'exists', (10, ))
        # 1: file exists, but dataset doesn't
        layout[1] = h5py.VirtualSource('test.h5', 'nonexists', (10, ))
        # 2: file doesn't exist
        layout[2] = h5py.VirtualSource('testnothere.h5', 'nonexists', (10, ))
        # 3: file exists, but don't have read permission
        layout[3] = h5py.VirtualSource('noaccess.h5', 'blah', (10, ))
        f.create_virtual_dataset('vds', layout)

    assert hdf5_vds_check.check_file(filename) == 3  # 3 inaccessible sources
Example #28
    def test_excalibur_high_level(self):
        self.outfile = self.working_dir + 'excalibur.h5'
        f = h5.File(self.outfile, 'w',
                    libver='latest')  # create an output file.
        in_key = 'data'  # where is the data at the input?
        in_sh = h5.File(self.fname[0],
                        'r')[in_key].shape  # get the input shape
        dtype = h5.File(self.fname[0], 'r')[in_key].dtype  # get the datatype
        # now generate the output shape
        vertical_gap = 10  # pixels spacing in the vertical
        nfiles = len(self.fname)
        print("nfiles is: " + str(nfiles))
        nframes = in_sh[0]
        width = in_sh[2]
        height = (in_sh[1] * nfiles) + (vertical_gap * (nfiles - 1))
        out_sh = (nframes, height, width)
        print(out_sh, in_sh)
        TGT = h5.VirtualTarget(
            self.outfile, 'data', shape=out_sh
        )  # Virtual target is a representation of the output dataset
        offset = 0  # initial offset
        print((offset + in_sh[1]) - offset)
        VMlist = []  # place to put the maps
        for i in range(nfiles):
            print("frame_number is: %s, offset is: %s" % (i, offset))  # for feedback
            VSRC = h5.VirtualSource(
                self.fname[i], in_key,
                shape=in_sh)  #a representation of the input dataset
            VM = h5.VirtualMap(VSRC,
                               TGT[:, offset:(offset + in_sh[1]), :],
                               dtype=dtype)  # map them with indexing
            offset += in_sh[1] + vertical_gap  # increment the offset
            VMlist.append(VM)  # append it to the list

        f.create_virtual_dataset(
            VMlist=VMlist,
            fillvalue=0x1)  # pass the fill value and list of maps
        f.close()

        f = h5.File(self.outfile, 'r')['data']
        self.assertEqual(f[3, 100, 0], 0.0)
        self.assertEqual(f[3, 260, 0], 1.0)
        self.assertEqual(f[3, 350, 0], 3.0)
        self.assertEqual(f[3, 650, 0], 6.0)
        self.assertEqual(f[3, 900, 0], 9.0)
        self.assertEqual(f[3, 1150, 0], 12.0)
        self.assertEqual(f[3, 1450, 0], 15.0)
        f.file.close()
Example #29
    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()
        self.f1 = osp.join(self.tmpdir, 'testfile1.h5')
        self.f2 = osp.join(self.tmpdir, 'testfile2.h5')

        self.data1 = np.arange(10)
        self.data2 = np.arange(10) * -1

        with h5.File(self.f1, 'w') as f:
            # dataset
            ds = f.create_dataset('data', (10, ), 'f4')
            ds[:] = self.data1

        with h5.File(self.f2, 'w') as f:
            # dataset
            ds = f.create_dataset('data', (10, ), 'f4')
            ds[:] = self.data2
            # virtual dataset
            layout = h5.VirtualLayout((2, 10), 'f4')
            vsource1 = h5.VirtualSource(self.f1, 'data', shape=(10, ))
            vsource2 = h5.VirtualSource(self.f2, 'data', shape=(10, ))
            layout[0] = vsource1
            layout[1] = vsource2
            f.create_virtual_dataset('virtual', layout)
Example #30
    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()
        self.path = osp.join(self.tmpdir, "resize.h5")
        with h5.File(self.path, "w") as f:
            source_dset = f.create_dataset("source",
                                           data=np.arange(20),
                                           shape=(10, 2),
                                           maxshape=(None, 2),
                                           chunks=(10, 1),
                                           fillvalue=-1)
            self.layout = h5.VirtualLayout((10, 1), int, maxshape=(None, 1))
            layout_source = h5.VirtualSource(source_dset)
            self.layout[:h5.UNLIMITED, 0] = layout_source[:h5.UNLIMITED, 1]

            f.create_virtual_dataset("virtual", self.layout)
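A sketch of what the h5.UNLIMITED mapping buys here (the resize values are illustrative): after growing the source along its unlimited axis, the virtual dataset should pick up the new rows the next time the file is opened.

with h5.File(self.path, 'a') as f:
    f['source'].resize((20, 2))  # grow the source along axis 0
with h5.File(self.path, 'r') as f:
    print(f['virtual'].shape)    # expected: (20, 1)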