def virtual_layouts(num_events: int, num_cues: int, dtypes: Dict[str, type]) -> Layouts:
    """Create a dictionary of data set names and corresponding HDF5 virtual layouts."""
    layouts = {}
    for key in event_keys:
        layouts[key] = h5py.VirtualLayout(shape=(num_events,), dtype=dtypes[key])
    for key in cue_keys:
        layouts[key] = h5py.VirtualLayout(shape=(num_cues,), dtype=dtypes[key])
    return layouts
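# Hedged usage sketch (added, not from the original source): ``event_keys`` and
# ``cue_keys`` are the module-level name tuples that virtual_layouts() iterates
# over; the counts and dtypes below are illustrative assumptions.
def example_virtual_layouts():
    dtypes = {key: np.uint64 for key in (*event_keys, *cue_keys)}
    layouts = virtual_layouts(num_events=1000, num_cues=50, dtypes=dtypes)
    # Each entry is an h5py.VirtualLayout, ready to be mapped to VirtualSources
    # and written out with create_virtual_dataset().
    return layouts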
def combine_h5(h5dir, out_h5file):
    filelist = list(pathlib.Path(h5dir).glob('*.h5'))
    unicode = h5py.special_dtype(vlen=str)
    n_files = len(filelist)

    # Get total no. of utts (spks) and no. of frames in .h5 files in the folder
    n_utts = list()
    n_frames = list()
    for i in range(n_files):
        with h5py.File(filelist[i], 'r') as f:
            n_utts.append(len(f['utt_ids']))
            n_frames.append(f['mfcc'].shape[0])
            mfcc_dim = f['mfcc'].shape[1]
    tot_n_utts = np.sum(n_utts)
    tot_n_frames = np.sum(n_frames)
    print(f"Total no. of utts = {tot_n_utts}")
    print(f"Total no. of frames = {tot_n_frames}")
    print(f"MFCC dim = {mfcc_dim}")

    # Assemble virtual dataset
    utt_layout = h5py.VirtualLayout(shape=(tot_n_utts,), dtype=unicode)
    spk_layout = h5py.VirtualLayout(shape=(tot_n_utts,), dtype=unicode)
    pos_layout = h5py.VirtualLayout(shape=(tot_n_utts, 2), dtype="int64")
    mfc_layout = h5py.VirtualLayout(shape=(tot_n_frames, mfcc_dim), dtype="float32")
    k1 = 0
    k2 = 0
    for i in range(n_files):
        print(f"Reading {filelist[i]}")
        range1 = range(k1, k1 + n_utts[i])
        range2 = range(k2, k2 + n_frames[i])
        print(f"spk_ids: {range1}")
        print(f"mfcc: {range2}")
        utt_layout[range1] = h5py.VirtualSource(filelist[i], "utt_ids", shape=(n_utts[i],))
        spk_layout[range1] = h5py.VirtualSource(filelist[i], "spk_ids", shape=(n_utts[i],))
        pos_layout[range1] = h5py.VirtualSource(filelist[i], "positions", shape=(n_utts[i], 2))
        mfc_layout[range2] = h5py.VirtualSource(filelist[i], "mfcc", shape=(n_frames[i], mfcc_dim))
        k1 = k1 + n_utts[i]
        k2 = k2 + n_frames[i]

    # Add virtual dataset to output file
    with h5py.File(out_h5file, "w", libver="latest") as f:
        print(f"Writing combined file {out_h5file}")
        f.create_virtual_dataset("utt_ids", utt_layout, fillvalue=None)
        f.create_virtual_dataset("spk_ids", spk_layout, fillvalue=None)
        f.create_virtual_dataset("positions", pos_layout, fillvalue=None)
        f.create_virtual_dataset("mfcc", mfc_layout, fillvalue=None)
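# Hedged usage sketch (added, not from the original source): directory and file
# names are illustrative. combine_h5() above scans a folder of .h5 shards and
# writes one virtual file exposing the concatenated datasets.
def example_combine_h5_mfcc():
    combine_h5("mfcc_parts", "mfcc_combined.h5")
    with h5py.File("mfcc_combined.h5", "r") as f:
        # Reads stitch transparently across the underlying shard files.
        print(f["mfcc"].shape, f["utt_ids"].shape)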
def test_mismatched_selections(self):
    layout = h5.VirtualLayout((4, 100), 'i4', maxshape=(4, None))
    filename = osp.join(self.tmpdir, "1.h5")
    vsource = h5.VirtualSource(filename, 'data', shape=(100,))
    with self.assertRaisesRegex(ValueError, r'different number'):
        layout[0, :49] = vsource[0:100:2]
def h5_virtual_file(filenames, name="data"):
    """Assemble a virtual h5 file from multiple source files."""
    vsources = []
    total_t = 0
    for path in filenames:
        data = h5py.File(path, "r").get(name)
        t, *features_shape = data.shape
        total_t += t
        vsources.append(h5py.VirtualSource(path, name, shape=(t, *features_shape)))

    # Assemble virtual dataset
    layout = h5py.VirtualLayout(shape=(total_t, *features_shape), dtype=data.dtype)
    cursor = 0
    for vsource in vsources:
        # we generate slices like layout[0:10, :, :, :]
        indices = (slice(cursor, cursor + vsource.shape[0]),) + (slice(None),) * (
            len(vsource.shape) - 1
        )
        layout[indices] = vsource
        cursor += vsource.shape[0]

    # Add virtual dataset to output file
    f = h5py.File(f"{uuid.uuid4()}.h5", "w", libver="latest")
    f.create_virtual_dataset(name, layout)
    return f
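# Hedged usage sketch (added, not from the original source): shard names are
# illustrative. The returned handle is an open, writable HDF5 file whose
# ``data`` dataset spans all shards along axis 0.
def example_h5_virtual_file():
    f = h5_virtual_file(["shard0.h5", "shard1.h5", "shard2.h5"], name="data")
    first_batch = f["data"][:32]  # reads across shard boundaries transparently
    f.close()
    return first_batch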
def merge(output, h5s):
    # Initialise before the try block so the finally clause is safe even if
    # opening one of the input files fails.
    dfs = []
    try:
        dfs = [h5py.File(h5, "r") for h5 in h5s]
        im_key = list(dfs[0].keys())[0] + "/images"
        im_shape = dfs[0][im_key].shape[1:]
        merged_shape = [0] + list(im_shape)
        for df in dfs:
            assert df[im_key].shape[1:] == im_shape, \
                "Image shape in %s (%s) does not equal %s" % (
                    df.filename, str(df[im_key].shape[1:]), str(im_shape))
            merged_shape[0] += df[im_key].shape[0]
        merged_shape = tuple(merged_shape)
        with h5py.File(output, "w") as merged_df:
            for changrp in dfs[0].keys():
                mergedgrp = merged_df.create_group(changrp)
                for key in dfs[0][changrp].keys():
                    layout = h5py.VirtualLayout(
                        shape=merged_shape, dtype=dfs[0][changrp][key].dtype)
                    vsources = []
                    i = 0
                    for df in dfs:
                        vsources.append(
                            h5py.VirtualSource(df[changrp + "/" + key]))
                        layout[i:i + vsources[-1].shape[0]] = vsources[-1]
                        i += vsources[-1].shape[0]
                    mergedgrp.create_virtual_dataset(key, layout)
    finally:
        for df in dfs:
            df.close()
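# Hedged usage sketch (added, not from the original source): file names are
# illustrative. Every channel group/key found in the first input is stacked
# along axis 0 in the merged output.
def example_merge():
    merge("merged.h5", ["run_000.h5", "run_001.h5"])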
def _assemble_data(self, source, key):
    """Assemble chunks of data into a virtual layout"""
    # First, get a list of all non-empty data chunks
    chunks = [
        c for c in self.data._find_data_chunks(source, key)
        if (c.counts > 0).any()
    ]
    chunks.sort(key=lambda c: c.train_ids[0])
    if not chunks:
        return None, None

    # Create the layout, which will describe what data is where
    n_total = np.sum([c.counts.sum() for c in chunks])
    ds0 = chunks[0].dataset
    layout = h5py.VirtualLayout(shape=(n_total,) + ds0.shape[1:],
                                dtype=ds0.dtype)

    # Map each chunk into the relevant part of the layout
    output_cursor = np.uint64(0)
    for chunk in chunks:
        n = chunk.counts.sum()
        src = h5py.VirtualSource(chunk.dataset)
        src = src[chunk.slice]
        layout[output_cursor:output_cursor + n] = src
        output_cursor += n

    assert output_cursor == n_total

    # Make an array of which train ID each data entry is for:
    train_ids = np.concatenate(
        [np.repeat(c.train_ids, c.counts.astype(np.intp)) for c in chunks])

    return layout, train_ids
def create_virtual_data(file_pattern, x, entry_key, save_to):
    files = [file_pattern % el for el in x]
    files = [(el, f) for f, el in zip(files, x) if os.path.exists(f)]
    # entry_key = '/ref/power/008'
    # save_to = "/Users/beauchamplab/rave_data/data_dir/congruency/YAB/rave/data/power/virtual.h5"
    if len(files) == 0:
        print('No valid files found')
        return False

    # get file shape
    with h5py.File(files[0][1], 'r') as sample_f:
        sh = sample_f[entry_key].shape
        dtype = sample_f[entry_key].dtype

    layout = h5py.VirtualLayout(shape=(len(files),) + sh, dtype=dtype)
    for i, file_dup in enumerate(files):
        filename = file_dup[1]
        el = file_dup[0]
        print(filename)
        vsource = h5py.VirtualSource(filename, entry_key, shape=sh)
        layout[i, :, :] = vsource

    with h5py.File(save_to, 'w', libver='latest') as f:
        f.create_virtual_dataset(entry_key, layout, fillvalue=np.nan)

    return True
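# Hedged usage sketch (added, not from the original source): the file pattern,
# electrode indices, entry key, and output path are illustrative assumptions;
# the per-file dataset must be 2-D to match ``layout[i, :, :]`` above.
def example_create_virtual_data():
    ok = create_virtual_data("power_%03d.h5", [8, 9, 10],
                             entry_key="/ref/power/008",
                             save_to="virtual.h5")
    return ok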
def writesino(h5name, omegas, dtys, filenames):
    offset, size, shape, dtype = binary_info(filenames[0][0])
    print(offset, size, shape, dtype)
    nframes = len(omegas[0]) * len(omegas)
    print(nframes, len(omegas), sum(len(o) for o in omegas))
    # Now create a hdf5 file:
    with h5py.File(h5name, "w", libver='latest') as h:
        # now create a VDS linking within the same file
        layout = h5py.VirtualLayout(shape=(nframes, shape[0], shape[1]),
                                    dtype=dtype)
        j = 0
        graw = h.require_group('scans')
        for i, scan in enumerate(filenames):
            g = graw.require_group('scan%04d' % (i))
            g.create_dataset("data",
                             shape=(len(scan), shape[0], shape[1]),
                             dtype=dtype,
                             external=[(fname, offset, size) for fname in scan])
            g.create_dataset("omega", data=omegas[i])
            g.create_dataset("dty", data=dtys[i])
            vsource = h5py.VirtualSource(h.filename,  # ok - circular?
                                         'scans/scan%04d/data' % (i),
                                         shape=(len(scan), shape[0], shape[1]))
            layout[j:j + len(scan), :, :] = vsource
            j += len(scan)
        g = h.require_group('sinogram')
        g.create_dataset('omega', data=np.concatenate(omegas))
        g.create_dataset('dty', data=np.concatenate(dtys))
        g.create_virtual_dataset('data', layout)
def save_epix(out_file, descriptor, trains, shape, epix_id):
    """
    Save EPIX data to a VDS HDF5 file

    out_file   - HDF5 file
    descriptor - list of data files to save
    trains     - train IDs to save
    shape      - EPIX data shape
    epix_id    - EPIX detector number
    """
    layout = h5py.VirtualLayout(shape=(trains.size,) + shape, dtype=np.uint16)
    counter = 0
    for file_name in descriptor:
        print('Opening file: {}'.format(os.path.basename(file_name)))
        with h5py.File(file_name, 'r') as data_file:
            file_trains = data_file[config.EPIX_TRAIN_KEY][:]
            file_data = data_file[config.EPIX_KEY.format(epix_id)]
            file_idxs = np.concatenate(
                [np.where(train_id == file_trains)[0] for train_id in trains])
            chunk_size = file_data.chunks[0]
            num_chunks = int(np.ceil(file_idxs.size / chunk_size))
            for chunk in range(num_chunks):
                start, end = chunk * chunk_size, min(file_data.shape[0],
                                                     (chunk + 1) * chunk_size)
                data = h5py.VirtualSource(file_data)[file_idxs[start:end], :, :]
                layout[counter:counter + file_idxs[start:end].size] = data
                counter += file_idxs[start:end].size
        print('File {0} saved, data size: {1:d}\n'.format(
            os.path.basename(file_name), counter))
    out_file.create_virtual_dataset(config.EPIX_DATA_KEY.format(epix_id), layout)
def __init__(self, filenames, default_streams=None):
    super(H5DatasetLoader, self).__init__()
    self.filenames = filenames
    if isinstance(self.filenames, list):
        self._h5_tempfile = tempfile.NamedTemporaryFile()
        #self.h5_file = h5py.File(self._h5_tempfile, 'w', libver='latest')
        self._allfiles, _allstreams, _lengths = zip(
            *[H5DatasetLoader.load_single_h5(f) for f in self.filenames])
        total_len = sum(_lengths)
        # create virtual datasets; assumes that all files have the streams
        # and per-stream shapes of the first file
        ll = (0,) + _lengths
        ll = np.cumsum(ll)
        for s in _allstreams[0]:
            shape = (total_len,) + self._allfiles[0][s].shape[1:]
            layout = h5py.VirtualLayout(shape=shape,
                                        dtype=self._allfiles[0][s].dtype)
            for idx, f in enumerate(self._allfiles):
                vsource = h5py.VirtualSource(f[s])
                layout[ll[idx]:ll[idx + 1]] = vsource
            with h5py.File(self._h5_tempfile.name, 'a', libver='latest') as f:
                f.create_virtual_dataset(s, layout)
        self._h5_tempfile.flush()
        self.h5_file = H5DatasetLoader.load_single_h5(self._h5_tempfile.name)[0]
    else:
        self.h5_file = H5DatasetLoader.load_single_h5(self.filenames)[0]
    self.streams_available = list(self.h5_file.keys())
    self.default_streams = default_streams
    if default_streams is not None:
        for s in default_streams:
            assert s in self.streams_available, f"{s} not found in available streams"
def test_percival_high_level(self):
    outfile = osp.join(self.working_dir, 'percival.h5')

    # Virtual layout is a representation of the output dataset
    # (np.float was removed from NumPy; the builtin float is equivalent)
    layout = h5.VirtualLayout(shape=(79, 200, 200), dtype=float)
    for k, filename in enumerate(self.fname):
        dim1 = 19 if k == 3 else 20
        vsource = h5.VirtualSource(filename, 'data', shape=(dim1, 200, 200))
        layout[k:79:4, :, :] = vsource[:, :, :]

    # Create the virtual dataset file
    with h5.File(outfile, 'w', libver='latest') as f:
        f.create_virtual_dataset('data', layout, fillvalue=-5)

    foo = np.array(2 * list(range(4)))
    with h5.File(outfile, 'r') as f:
        ds = f['data']
        line = ds[:8, 100, 100]
        self.assertEqual(ds.shape, (79, 200, 200))
        assert_array_equal(line, foo)
def make_vds(self, f):
    # virtual dataset
    layout = h5.VirtualLayout((2, 10), 'f4')
    vsource1 = h5.VirtualSource(self.f1, 'data', shape=(10,))
    vsource2 = h5.VirtualSource(self.f2, 'data', shape=(10,))
    layout[0] = vsource1
    layout[1] = vsource2
    f.create_virtual_dataset('virtual', layout)
def build_virtual_layout(sources, source_shapes, dtype):
    virtual_layout = h5py.VirtualLayout(shape=source_shapes, dtype=dtype)
    offset = 0
    for source in sources:
        length = source.shape[0]
        virtual_layout[offset:offset + length] = source
        offset += length
    return virtual_layout
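# Hedged usage sketch (added, not from the original source): file and dataset
# names are illustrative. The sources must share their trailing dimensions;
# ``source_shapes`` is the combined output shape.
def example_build_virtual_layout():
    src_a = h5py.VirtualSource("a.h5", "data", shape=(100, 64))
    src_b = h5py.VirtualSource("b.h5", "data", shape=(50, 64))
    layout = build_virtual_layout([src_a, src_b], (150, 64), "float32")
    with h5py.File("stacked.h5", "w", libver="latest") as f:
        f.create_virtual_dataset("data", layout)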
def split(input_h5, output_h5):
    """Read the data file, create N_FAST * N_SLOW new data sets, then copy
    the data from the former into the latter and build a VDS"""

    with h5py.File(input_h5, "r") as fin:
        frames, slow, fast = fin["data"].shape

        output_files = []
        output_dsets = []

        for n in range(len(CHUNKMAP)):
            filename = output_h5.replace(".h5", "_%02d.h5" % n)
            fout = h5py.File(filename, "x")
            # in here I am chunking as 4-module chunks but _maybe_ we should
            # consider chunking as 1-module chunks and having 4 chunks per
            # "image" -> :thinking_face:
            dset = fout.create_dataset(
                "data",
                (frames, 4 * MOD_SLOW, MOD_FAST),
                chunks=(1, 4 * MOD_SLOW, MOD_FAST),
                compression=bitshuffle.h5.H5FILTER,
                compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4),
                dtype=fin["data"].dtype,
            )
            output_files.append((fout, filename))
            output_dsets.append(dset)

        blit(fin["data"], output_dsets)

        for fout in output_files:
            fout[0].close()

        # create VDS
        layout = h5py.VirtualLayout(shape=(frames, slow, fast), dtype="i4")

        for i, chunk in enumerate(CHUNKMAP):
            source = h5py.VirtualSource(output_files[i][1], "data",
                                        shape=(frames, 4 * MOD_SLOW, MOD_FAST))
            for k, n in enumerate(chunk):
                s, f = divmod(n, N_FAST)
                f0 = f * (MOD_FAST + GAP_FAST)
                f1 = f0 + MOD_FAST
                s0 = s * (MOD_SLOW + GAP_SLOW)
                s1 = s0 + MOD_SLOW
                layout[:, s0:s1, f0:f1] = source[:, k * MOD_SLOW:(k + 1) * MOD_SLOW, :]

        fout = h5py.File(output_h5, "x")
        data = fout.create_virtual_dataset("data", layout, fillvalue=-1)
        for k in "image_nr_low", "image_nr_high":
            data.attrs.create(k, fin["data"].attrs.get(k), dtype="i4")
def preallocate_output(self, out, parallel_store=False):
    """
    Storage allocation and provisioning

    Parameters
    ----------
    out : syncopy data object
        Empty object for holding results
    parallel_store : bool
        If `True`, a directory for virtual source files is created
        in Syncopy's temporary on-disk storage (defined by `syncopy.__storage__`).
        Otherwise, a dataset of appropriate type and shape is allocated
        in a new regular HDF5 file created inside Syncopy's temporary storage
        folder.

    Returns
    -------
    Nothing : None

    See also
    --------
    compute : management routine controlling memory pre-allocation
    """

    # In case parallel writing via VDS storage is requested, prepare
    # directory for by-chunk HDF5 files and construct virtual HDF layout
    if parallel_store:
        vdsdir = os.path.splitext(os.path.basename(out.filename))[0]
        self.virtualDatasetDir = os.path.join(__storage__, vdsdir)
        os.mkdir(self.virtualDatasetDir)

        layout = h5py.VirtualLayout(shape=self.outputShape, dtype=self.dtype)
        for k, idx in enumerate(self.targetLayout):
            fname = os.path.join(self.virtualDatasetDir, "{0:d}.h5".format(k))
            # Catch empty selections: don't map empty sources into the layout of the VDS
            if all([sel for sel in self.sourceLayout[k]]):
                layout[idx] = h5py.VirtualSource(fname, self.virtualDatasetNames,
                                                 shape=self.targetShapes[k])
        self.VirtualDatasetLayout = layout
        self.outFileName = os.path.join(self.virtualDatasetDir, "{0:d}.h5")
        self.tmpDsetName = self.virtualDatasetNames

    # Create regular HDF5 dataset for sequential writing
    else:

        # The shape of the target depends on trial-averaging
        if not self.keeptrials:
            shp = self.cfg["chunkShape"]
        else:
            shp = self.outputShape
        with h5py.File(out.filename, mode="w") as h5f:
            h5f.create_dataset(name=self.outDatasetName,
                               dtype=self.dtype, shape=shp)
        self.outFileName = out.filename
        self.tmpDsetName = self.outDatasetName
def concatenate(file_names_to_concatenate):
    entry_key = 'data'  # where the data is inside of the source files.
    sh = h5py.File(file_names_to_concatenate[0], 'r')[entry_key].shape  # get the first one's shape.
    layout = h5py.VirtualLayout(shape=(len(file_names_to_concatenate),) + sh,
                                dtype=np.float64)
    with h5py.File("VDS.h5", 'w', libver='latest') as f:
        for i, filename in enumerate(file_names_to_concatenate):
            vsource = h5py.VirtualSource(filename, entry_key, shape=sh)
            layout[i, :, :, :] = vsource
        f.create_virtual_dataset(entry_key, layout, fillvalue=0)
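# Hedged usage sketch (added, not from the original source): file names are
# illustrative. Each input must hold a 3-D ``data`` dataset of identical shape,
# so the ``layout[i, :, :, :]`` indexing above matches.
def example_concatenate():
    concatenate(["scan_000.h5", "scan_001.h5", "scan_002.h5"])
    with h5py.File("VDS.h5", "r") as f:
        print(f["data"].shape)  # (3,) + the per-file shape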
def createResource(cls, directory):
    filename = os.path.join(directory, "base.h5")
    extH5FileName = os.path.join(directory, "base__external.h5")
    extDatFileName = os.path.join(directory, "base__external.dat")

    externalh5 = h5py.File(extH5FileName, mode="w")
    externalh5["target/dataset"] = 50
    externalh5["target/link"] = h5py.SoftLink("/target/dataset")
    externalh5["/ext/vds0"] = [0, 1]
    externalh5["/ext/vds1"] = [2, 3]
    externalh5.close()

    numpy.array([0, 1, 10, 10, 2, 3]).tofile(extDatFileName)

    h5 = h5py.File(filename, mode="w")
    h5["group/dataset"] = 50
    h5["link/soft_link"] = h5py.SoftLink("/group/dataset")
    h5["link/soft_link_to_group"] = h5py.SoftLink("/group")
    h5["link/soft_link_to_link"] = h5py.SoftLink("/link/soft_link")
    h5["link/soft_link_to_file"] = h5py.SoftLink("/")
    h5["group/soft_link_relative"] = h5py.SoftLink("dataset")
    h5["link/external_link"] = h5py.ExternalLink(extH5FileName, "/target/dataset")
    h5["link/external_link_to_link"] = h5py.ExternalLink(
        extH5FileName, "/target/link")
    h5["broken_link/external_broken_file"] = h5py.ExternalLink(
        extH5FileName + "_not_exists", "/target/link")
    h5["broken_link/external_broken_link"] = h5py.ExternalLink(
        extH5FileName, "/target/not_exists")
    h5["broken_link/soft_broken_link"] = h5py.SoftLink("/group/not_exists")
    h5["broken_link/soft_link_to_broken_link"] = h5py.SoftLink(
        "/group/not_exists")

    layout = h5py.VirtualLayout((2, 2), dtype=int)
    layout[0] = h5py.VirtualSource("base__external.h5", name="/ext/vds0",
                                   shape=(2,), dtype=int)
    layout[1] = h5py.VirtualSource("base__external.h5", name="/ext/vds1",
                                   shape=(2,), dtype=int)
    h5.create_group("/ext")
    h5["/ext"].create_virtual_dataset("virtual", layout)

    external = [("base__external.dat", 0, 2 * 8),
                ("base__external.dat", 4 * 8, 2 * 8)]
    h5["/ext"].create_dataset("raw", shape=(2, 2), dtype=int, external=external)
    h5.close()
    return filename
def tile_h5datasets(dest, name, sources, shape_map, tile_shape, nscandim=1):
    """Merge datasets in a virtual dataset.

    :param h5py.Group dest:
    :param str name:
    :param list(h5py.Dataset) sources:
    :param dict shape_map:
    :param int nscandim: start index of the data dimensions
    """
    dset_shapes = [dset.shape for dset in sources]
    scan_shapes = [dset_shape[:nscandim] for dset_shape in dset_shapes]  # F-order
    det_shapes = [dset_shape[nscandim:] for dset_shape in dset_shapes]
    reshaped_scan_shapes = [
        shape_map.get(scan_shape, scan_shape) for scan_shape in scan_shapes
    ]  # F-order
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order
    reduced_scan_shapes, reshaped_scan_shapes = zip(
        *(match_shapes([shape1, shape2[::-1]])
          for shape1, shape2 in zip(scan_shapes, reshaped_scan_shapes)))
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order
    tile_shape = tile_shape[::-1]  # C-order
    layout_scan_shape, indices = tile_indices(tile_shape,
                                              reshaped_scan_shapes,
                                              order="C")
    layout_shape = layout_scan_shape + max_shape(det_shapes)
    dtype = sources[0].dtype
    fillvalue = sources[0].fillvalue
    layout = h5py.VirtualLayout(shape=layout_shape, dtype=dtype)
    for layout_idx, dset, reduced_scan_shape, det_shape in zip(
            indices, sources, reduced_scan_shapes, det_shapes):
        vsource = h5py.VirtualSource(
            dset.file.filename,
            dset.name,
            shape=dset.shape,
            dtype=dset.dtype,
        )
        reduced_source_shape = reduced_scan_shape + det_shape
        det_idx = tuple(slice(0, n) for n in det_shape)
        if reduced_source_shape != vsource.shape:
            # Select the reduced scan region plus the full detector region;
            # slicing over reduced_source_shape and then appending det_idx
            # would exceed the source rank.
            vsource_idx = tuple(slice(0, n) for n in reduced_scan_shape)
            vsource_idx += det_idx
            vsource = vsource[vsource_idx]
        layout_idx += det_idx
        layout[layout_idx] = vsource
    dest.create_virtual_dataset(name, layout, fillvalue=fillvalue)
def test_index_layout(self):
    # Assemble virtual dataset (indexing target)
    layout = h5.VirtualLayout((100,), 'i4')
    inds = [3, 6, 20, 25, 33, 47, 70, 75, 96, 98]
    filename = osp.join(self.tmpdir, "1.h5")
    vsource = h5.VirtualSource(filename, 'data', shape=(10,))
    layout[inds] = vsource

    outfile = osp.join(self.tmpdir, 'VDS.h5')

    # Assemble virtual dataset (indexing source)
    layout2 = h5.VirtualLayout((6,), 'i4')
    inds2 = [0, 1, 4, 5, 8]
    layout2[1:] = vsource[inds2]

    # Add virtual datasets to output file and close
    with h5.File(outfile, 'w', libver='latest') as f:
        f.create_virtual_dataset('/data', layout, fillvalue=-5)
        f.create_virtual_dataset('/data2', layout2, fillvalue=-3)

    # Read data from virtual datasets
    with h5.File(outfile, 'r') as f:
        data = f['/data'][()]
        data2 = f['/data2'][()]

    # Verify
    assert_array_equal(data[inds], np.arange(10) * 10)
    assert_array_equal(data2[1:], [0, 10, 40, 50, 80])

    mask = np.zeros(100)
    mask[inds] = 1
    self.assertEqual(data[mask == 0].min(), -5)
    self.assertEqual(data[mask == 0].max(), -5)
    self.assertEqual(data2[0], -3)
def setUp(self):
    self.tmpdir = tempfile.mkdtemp()
    self.path = osp.join(self.tmpdir, "resize.h5")
    with h5.File(self.path, "w") as f:
        source_dset = f.create_dataset("source",
                                       data=np.arange(20),
                                       shape=(10, 2),
                                       maxshape=(None, 2),
                                       chunks=(10, 1),
                                       fillvalue=-1)
        # np.int was removed from NumPy; the builtin int is equivalent here
        self.layout = h5.VirtualLayout((10, 1), int, maxshape=(None, 1))
        layout_source = h5.VirtualSource(source_dset)
        self.layout[:h5.UNLIMITED, 0] = layout_source[:h5.UNLIMITED, 1]
        f.create_virtual_dataset("virtual", self.layout)
def combine_h5(filelist, out_h5file):
    n_files = len(filelist)
    n_x = list()
    n_y = list()
    for i in range(n_files):
        with h5py.File(filelist[i], 'r') as f:
            n_x.append(f['x'].shape[0])
            n_y.append(len(f['y']))
            x_dim = f['x'].shape[1]
    tot_n_x = np.sum(n_x)
    tot_n_y = np.sum(n_y)
    print(f"Total no. of x = {tot_n_x}")
    print(f"Total no. of y = {tot_n_y}")
    print(f"Feature vectors dim = {x_dim}")

    # Assemble virtual dataset
    x_layout = h5py.VirtualLayout(shape=(tot_n_x, x_dim), dtype=np.float32)
    y_layout = h5py.VirtualLayout(shape=(tot_n_y,), dtype=np.int32)
    k1 = 0
    for i in range(n_files):
        print(f"Reading {filelist[i]}")
        range1 = range(k1, k1 + n_x[i])
        x_layout[list(range1)] = h5py.VirtualSource(filelist[i], "x",
                                                    shape=(n_x[i], x_dim))
        y_layout[list(range1)] = h5py.VirtualSource(filelist[i], "y",
                                                    shape=(n_y[i],))
        k1 = k1 + n_x[i]

    # Add virtual dataset to output file
    with h5py.File(out_h5file, "w", libver="latest") as f:
        print(f"Writing combined file {out_h5file}")
        f.create_virtual_dataset("x", x_layout, fillvalue=None)
        f.create_virtual_dataset("y", y_layout, fillvalue=None)
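# Hedged usage sketch (added, not from the original source): file names are
# illustrative. Each input is assumed to hold matching numbers of 'x' rows and
# 'y' labels, since the same row range indexes both layouts above.
def example_combine_h5_xy():
    combine_h5(["train_part1.h5", "train_part2.h5"], "train_full.h5")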
def concatenate_virtual_h5(input_file_names: List[str],
                           output_name: str,
                           fields: Optional[List[str]] = None):
    r"""Concatenate HDF5 files into a virtual HDF5 file.

    Concatenates a list `input_file_names` of HDF5 files containing
    the same format into a single virtual dataset.

    Parameters
    ----------
    input_file_names : List[str]
        List of HDF5 file names to concatenate.
    output_name : str
        Name of output virtual HDF5 file.
    fields : Optional[List[str]]
        Which dataset fields to concatenate. Will concatenate all fields by default.
    """
    # Open first file to get dataset shape and dtype
    # Assumes uniform number of data points per file
    h5_file = h5py.File(input_file_names[0], "r")

    if not fields:
        fields = list(h5_file.keys())

    # Helper function to output concatenated shape
    def concat_shape(shape: Tuple[int]) -> Tuple[int]:
        return (len(input_file_names) * shape[0], *shape[1:])

    # Create a virtual layout for each input field
    layouts = {
        field: h5py.VirtualLayout(
            shape=concat_shape(h5_file[field].shape),
            dtype=h5_file[field].dtype,
        )
        for field in fields
    }

    with h5py.File(output_name, "w", libver="latest") as f:
        for field in fields:
            for i, filename in enumerate(input_file_names):
                shape = h5_file[field].shape
                vsource = h5py.VirtualSource(filename, field, shape=shape)
                layouts[field][i * shape[0]:(i + 1) * shape[0], ...] = vsource
            f.create_virtual_dataset(field, layouts[field])

    h5_file.close()
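# Hedged usage sketch (added, not from the original source): file and field
# names are illustrative. All shards must share per-file dataset shapes, as
# the docstring above assumes.
def example_concatenate_virtual_h5():
    concatenate_virtual_h5(["sim_0.h5", "sim_1.h5"], "sim_all.h5",
                           fields=["positions", "velocities"])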
def finalize(self):
    vds_shape = (len(self.__idxs),) + self.__shape

    # Assemble virtual dataset
    layout = h5py.VirtualLayout(shape=vds_shape, dtype=self.__dtype)
    for key in self.__idxs.keys():
        filename = self.__idxs[key]
        vsource = h5py.VirtualSource(filename, self.__opath, shape=self.__shape)
        layout[key] = vsource

    with h5py.File(self.foname, "w", libver="latest") as f:
        f.create_virtual_dataset(self.dspath, layout, fillvalue=-5)

    self.LogInfo("finalized, close HDF5 File: " + self.foname)
    return True
def split(input_h5, output_h5):
    """Read the data file, create N_FAST * N_SLOW new data sets, then copy
    the data from the former into the latter and build a VDS"""

    with h5py.File(input_h5, "r") as fin:
        frames, slow, fast = fin["data"].shape

        output_files = []
        output_dsets = []

        for n in range(N_FAST * N_SLOW):
            filename = output_h5.replace(".h5", "_%02d.h5" % n)
            fout = h5py.File(filename, "x")
            dset = fout.create_dataset(
                "data",
                (frames, MOD_SLOW, MOD_FAST),
                chunks=(1, MOD_SLOW, MOD_FAST),
                compression=bitshuffle.h5.H5FILTER,
                compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4),
                dtype=fin["data"].dtype,
            )
            output_files.append((fout, filename))
            output_dsets.append(dset)

        blit(fin["data"], output_dsets)

        for fout in output_files:
            fout[0].close()

        # create VDS
        layout = h5py.VirtualLayout(shape=(frames, slow, fast), dtype="i4")

        for n in range(N_SLOW * N_FAST):
            s, f = divmod(n, N_FAST)
            source = h5py.VirtualSource(output_files[n][1], "data",
                                        shape=(frames, MOD_SLOW, MOD_FAST))
            f0 = f * (MOD_FAST + GAP_FAST)
            f1 = f0 + MOD_FAST
            s0 = s * (MOD_SLOW + GAP_SLOW)
            s1 = s0 + MOD_SLOW
            layout[:, s0:s1, f0:f1] = source

        fout = h5py.File(output_h5, "x")
        data = fout.create_virtual_dataset("data", layout, fillvalue=-1)
        for k in "image_nr_low", "image_nr_high":
            data.attrs.create(k, fin["data"].attrs.get(k), dtype="i4")
def join(target_name, feature_names, databases):
    with h5py.File(target_name, "w") as f:
        for feat_name in feature_names:
            lengths = [getattr(db, feat_name).shape[0] for db in databases]
            dim = set([getattr(db, feat_name).shape[1] for db in databases]).pop()
            layout = h5py.VirtualLayout(shape=(sum(lengths), dim))
            offset = 0
            for i, n in enumerate(lengths):
                vsource = h5py.VirtualSource(databases[i].h5_file, feat_name,
                                             shape=(n, dim))
                layout[offset:offset + n] = vsource
                offset += n
            ds = f.create_virtual_dataset(feat_name, layout)
            # h5py attrs cannot be replaced wholesale; copy entries instead
            ds.attrs.update(getattr(databases[0], feat_name).attrs)
def combineFiles(fileNames, keys):
    '''
    Use a virtual dataset in a temporary .h5 file to combine files with
    entries of the same shape, so that they appear to be one contiguous
    dataset.
    '''
    tmpFile = '/tmp/tmpVDS.h5'
    if os.path.exists(tmpFile):
        os.remove(tmpFile)
    for key in keys:
        sources = []
        totalLength = 0
        shape = None
        for fileName in fileNames:
            with h5py.File(fileName, 'r') as tmpF:
                source = h5py.VirtualSource(tmpF[key])
                shape = source.shape[1:]
                totalLength += source.shape[0]
                sources.append(source)
        # np.float was removed from NumPy; the builtin float is equivalent
        layout = h5py.VirtualLayout(shape = (totalLength,) + tuple(shape),
                                    dtype = float)
        offset = 0
        for source in sources:
            length = source.shape[0]
            layout[offset : offset + length] = source
            offset += length
        with h5py.File(tmpFile, 'a', libver = 'latest') as f:
            f.create_virtual_dataset(key, layout, fillvalue = np.nan)
    return tmpFile
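# Hedged usage sketch (added, not from the original source): file names and key
# are illustrative. The returned path points at the temporary VDS file.
def example_combineFiles():
    tmp = combineFiles(["run1.h5", "run2.h5"], keys=["waveforms"])
    with h5py.File(tmp, "r") as f:
        print(f["waveforms"].shape)  # total length across both runs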
def construct_virtual_sources(self, task, file_shape):
    taskname = task['name']
    layout = task['layout']
    scales = task['scales']
    op = task['operator']
    virt_layout = h5py.VirtualLayout(shape=file_shape, dtype=op.dtype)
    for i in range(self.dist.comm_cart.size):
        file_name = '%s_s%i_p%i.h5' % (self.base_path.stem, self.set_num, i)
        folder_name = '%s_s%i' % (self.base_path.stem, self.set_num)
        folder_path = self.base_path.joinpath(folder_name)
        src_file_name = folder_path.joinpath(file_name).relative_to(self.base_path)
        gnc_shape, gnc_start, write_shape, write_start, write_count = \
            self.get_write_stats(layout, scales, op.domain, op.tensorsig,
                                 index=0, virtual_file=True, rank=i)
        shape_stop = len(op.tensorsig) + 1
        src_shape = file_shape[slice(0, shape_stop)] + layout.local_shape(
            op.domain, scales, rank=i)
        start = gnc_start
        count = write_count
        spatial_slices = tuple(slice(s, s + c) for (s, c) in zip(start, count))
        slices = (slice(None),) + spatial_slices
        maxshape = (None,) + tuple(count)
        tname = 'tasks/{}'.format(taskname)
        vsource = h5py.VirtualSource(src_file_name, name=tname,
                                     shape=src_shape, maxshape=maxshape)
        virt_layout[slices] = vsource
    return virt_layout
def create_virtual_dataset(fdir, files, key):
    """
    construct a virtual dataset, containing multiple h5 files

    :param fdir: location of virtual dataset
    :param files: list of files (strings) to be added to virtual dataset
    :param key: key from .h5 files to add to virtual dataset
    """
    sh = h5py.File(files[0], 'r')[key].shape  # get the first one's shape.
    layout = h5py.VirtualLayout(shape=(len(files),) + sh, dtype=np.float64)
    with h5py.File(fdir, 'w', libver='latest') as f:
        # store the source file names alongside the VDS; the list must be
        # passed as data= (the second positional argument is the shape)
        f.create_dataset("index", data=files)
        for i, filename in enumerate(files):
            vsource = h5py.VirtualSource(filename, key, shape=sh)
            layout[i] = vsource
        f.create_virtual_dataset(key, layout, fillvalue=0)
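# Hedged usage sketch (added, not from the original source): file names and key
# are illustrative. Stacks one dataset from each input file into the VDS file.
def example_create_virtual_dataset():
    create_virtual_dataset("vds.h5", ["a.h5", "b.h5"], key="spectra")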
def make_virtual_ds(self):
    # Assemble virtual dataset
    layout = h5.VirtualLayout((4, 100), 'i4', maxshape=(4, None))
    for n in range(1, 5):
        filename = osp.join(self.tmpdir, "{}.h5".format(n))
        vsource = h5.VirtualSource(filename, 'data', shape=(100,))
        # Fill the first half with positions 0, 2, 4... from the source
        layout[n - 1, :50] = vsource[0:100:2]
        # Fill the second half with positions 1, 3, 5... from the source
        layout[n - 1, 50:] = vsource[1:100:2]

    outfile = osp.join(self.tmpdir, 'VDS.h5')

    # Add virtual dataset to output file
    with h5.File(outfile, 'w', libver='latest') as f:
        f.create_virtual_dataset('/group/data', layout, fillvalue=-5)

    return outfile
def create_virtual_layout(self, source_meta):
    """Create a virtual layout mapping raw data into the VDS.

    Args:
        source_meta(SourceMeta): Source attributes

    Returns:
        h5.VirtualLayout: Layout describing links between raw data and VDS

    """
    source_shape = source_meta.frames + \
        (source_meta.height, source_meta.width)

    spacing = self.construct_vds_spacing()
    target_height = source_meta.height * len(self.files) + sum(spacing)
    target_shape = source_meta.frames + (target_height, source_meta.width)
    self.logger.debug("VDS metadata:\n"
                      "  Shape: %s\n"
                      "  Spacing: %s", target_shape, spacing)

    v_layout = h5.VirtualLayout(target_shape, source_meta.dtype)

    current_position = 0
    for stripe_idx, file_path in enumerate(self.files):
        v_source = h5.VirtualSource(file_path,
                                    name=self.source_node,
                                    shape=source_shape,
                                    dtype=source_meta.dtype)

        start = current_position
        end = start + source_meta.height
        current_position = end + spacing[stripe_idx]

        # Hyperslab: All frames for each axis,
        #            Height bounds of stripe,
        #            Entire width
        v_layout[..., start:end, :] = v_source

        self.logger.debug("Mapping %s[..., %s:%s, :] to %s[...].",
                          self.name, start, end, file_path.split("/")[-1])

    return v_layout