def subsample_data(images, labels, pct, chunk_size=None, shuffle=False, seed=None,
                   labels_id='labels'):
    hdf5_aux = []
    if pct < 1.:
        # Store the data into an auxiliary HDF5 file
        filename = 'hdf5_aux_{}'.format(time.time())
        da.to_hdf5(filename, {'data': images, labels_id: labels})

        # Read HDF5
        hdf5_aux1 = h5py.File(filename, 'r')
        images, labels, hdf5_aux2 = hdf52dask(hdf5_aux1, group=None,
                                              chunk_size=chunk_size,
                                              shuffle=shuffle, seed=seed,
                                              pct=pct, labels_id=labels_id)
        if hdf5_aux2:
            hdf5_aux.extend([hdf5_aux1, hdf5_aux2])
        else:
            hdf5_aux.extend([hdf5_aux1])

    return images, labels, hdf5_aux
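A minimal usage sketch (not part of the original code): it assumes the `hdf52dask` helper used above is importable and that `images` / `labels` are dask arrays; the shapes and percentage are illustrative only.

import dask.array as da

# Hypothetical inputs: 1000 images with integer labels.
images = da.random.random((1000, 32, 32, 3), chunks=(100, 32, 32, 3))
labels = da.random.randint(0, 10, size=(1000,), chunks=(100,))

# Keep roughly 10 % of the data; the auxiliary HDF5 files stay open
# until explicitly closed.
sub_images, sub_labels, open_files = subsample_data(images, labels, pct=0.1)
# ... use sub_images / sub_labels ...
for f in open_files:
    f.close()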
def create_input_chunks(cs, partition, data_dir, file_format):
    """
    cs: chunk shape
    file_format: file format
    data_dir: to store the file
    """
    if file_format == "HDF5":
        file_manager = HDF5_manager()
    else:
        print("File format not supported yet. Aborting...")
        sys.exit(1)

    print(f"Creating input chunks at {data_dir}")
    create_empty_dir(data_dir)
    _slices = ((0, cs[0]), (0, cs[1]), (0, cs[2]))
    for i in range(partition[0]):
        for j in range(partition[1]):
            for k in range(partition[2]):
                print(f"Creating random array... shape: {cs}")
                arr = da.random.uniform(size=cs)
                print(f"Done, converting to float16...")
                arr = arr.astype(np.float16)
                out_filename = f'{i}_{j}_{k}.hdf5'
                print(f"Building {out_filename} with shape {cs}")
                outfilepath = os.path.join(data_dir, out_filename)
                print(f"Storing...")
                da.to_hdf5(outfilepath, '/data', arr, chunks=None, compression=None)
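A short usage sketch, assuming the `HDF5_manager` and `create_empty_dir` helpers referenced above are available; the chunk shape, partition, and output directory are made up for illustration.

# Hypothetical call: write a 2 x 2 x 2 grid of (100, 100, 100) float16 chunks
# as separate HDF5 files under ./input_chunks.
create_input_chunks(cs=(100, 100, 100),
                    partition=(2, 2, 2),
                    data_dir="./input_chunks",
                    file_format="HDF5")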
def run(self):
    os.makedirs(self.output().path)
    filename_pattern = os.path.join('./data/images', '*.jpg')
    dsk_images = dask_image.imread.imread(filename_pattern)
    da.to_hdf5('data/storage' + "/" + 'stored.hdf5', {'/x': dsk_images[0]})
def normalize(self, groupname1, groupname2):
    # ## normalize y ## #
    with h5py.File(self.OUTPATH, mode='r+') as f:
        for atom in self.MAINCHAIN:
            # load
            train_y = da.from_array(
                f[f'/{atom}/{groupname1}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            val_y = da.from_array(
                f[f'/{atom}/{groupname2}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            total_y = da.concatenate([train_y, val_y], axis=0)

            y_mean = da.mean(total_y.reshape(-1), axis=0).compute()
            y_std = da.std(total_y.reshape(-1), axis=0).compute()

            # normalize
            train_y = da.divide(da.subtract(train_y, y_mean), y_std)
            val_y = da.divide(da.subtract(val_y, y_mean), y_std)

            # save
            da.to_hdf5(self.OUTPATH, f'/{atom}/{groupname1}/{self.RESPONSE_NAME}', train_y)
            da.to_hdf5(self.OUTPATH, f'/{atom}/{groupname2}/{self.RESPONSE_NAME}', val_y)

            f.create_dataset(name=f'/{atom}/normalization',
                             data=np.array([y_mean, y_std]))

            print(f'[{atom}]\tmean: {y_mean:.3f}\tstd: {y_std:.3f}')
def save_to_hdf5(arr, file_path, physik_cs=None, key='/data', compression=None):
    """ Save dask array to hdf5 dataset.

    Arguments:
    ----------
        arr: dask array
        file_path
        physik_cs
        key
        compression: compression algorithm. If None then compression is disabled.
    """
    print(f'Saving a dask array at {file_path}:')
    print(f'- physik_cs: {physik_cs}')
    print(f'- key: {key}')
    print(f'- compression: {compression}')

    da.to_hdf5(file_path, key, arr, chunks=physik_cs, compression=compression)

    print(f'Array successfully saved.\n')
    print(f'Inspecting created file...')
    with h5py.File(file_path, 'r') as f:
        inspect_h5py_file(f)
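An illustrative call (not from the original code): the file path and chunk shape are placeholders, and the `inspect_h5py_file` helper used inside the function is assumed to be importable.

import dask.array as da

# Persist a random dask array with gzip compression and a physical
# on-disk chunk shape of (100, 100, 100).
arr = da.random.random((500, 500, 500), chunks=(100, 100, 100))
save_to_hdf5(arr, '/tmp/example.hdf5', physik_cs=(100, 100, 100),
             key='/data', compression='gzip')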
def __call__(self, times=None):
    """Run the filtering process on this experiment."""
    # run over the full range of valid time indices unless specified otherwise
    tgrid = self.fieldset.gridset.grids[0].time
    if times is None:
        times = tgrid.copy()

    if self.uneven_window:
        raise NotImplementedError("uneven windows aren't supported")

    # restrict to period covered by window
    times = np.array(times)
    window_left = times - tgrid[0] >= self.window_size
    window_right = times <= tgrid[-1] - self.window_size
    times = times[window_left & window_right]

    da_out = {v: [] for v in self.sample_variables}

    # do the filtering at each timestep
    for idx, time in enumerate(times):
        # returns a dictionary of sample_variable -> dask array
        filtered = self.filter_step(idx, time)
        for v, a in filtered.items():
            da_out[v].append(a)

    # dump all to disk
    da.to_hdf5(self.name + ".h5", {v: da.stack(a) for v, a in da_out.items()})
def save_data(self, path):
    """Writes the data in the right format to disk"""
    if self.metadata["input_type"] == "hyperspy":
        self.data.save(path)
    else:
        if isinstance(self.data, da.core.Array):
            da.to_hdf5(path, f"/{DEFAULT_DATA_FILE}", self.data)
        else:
            with h5py.File(path, "w") as f:
                f.create_dataset(f"{DEFAULT_DATA_FILE}", data=self.data)
def save_arr(arr, storage_type, file_path, key='/data', axis=0, chunks_shape=None):
    """ Save array to hdf5 dataset or numpy file stack.
    """
    if storage_type == "hdf5":
        if chunks_shape:
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        else:
            da.to_hdf5(file_path, key, arr)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
    return
def estimate_ld(hd5, filename, chromosome, threads, memory):
    print("Estimating LD for Chromosome", chromosome)
    dset = hd5['/%s' % chromosome][:]
    chunks = estimate_chunks(dset.shape, threads, memory)
    array = da.from_array(dset, chunks=chunks)
    del dset
    gc.collect()
    rho = da.corrcoef(da.ma.masked_invalid(array).T) ** 2
    filename = '%s_ld.hdf5' % filename
    da.to_hdf5(filename, '/%s' % chromosome, rho)
    return chromosome, filename
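A sketch of how this might be called; the genotype file name, the one-dataset-per-chromosome layout, and the `estimate_chunks` helper are all assumptions carried over from the function above, not confirmed by the source.

import h5py

# Hypothetical call: compute the squared-correlation LD matrix for chromosome 22
# and write it next to the study file.
with h5py.File('genotypes.hdf5', 'r') as hd5:
    chrom, ld_file = estimate_ld(hd5, 'mystudy', '22', threads=4, memory=8e9)
    print('LD matrix for chromosome', chrom, 'written to', ld_file)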
def create_input_file(shape, dirname, file_manager):
    """ Creating the original array.
    """
    filename = f'{shape[0]}_{shape[1]}_{shape[2]}_original.hdf5'
    filepath = os.path.join(dirname, filename)
    if not os.path.isfile(filepath):
        arr = da.random.random(size=shape)
        arr = arr.astype(np.float16)
        da.to_hdf5(filepath, '/data', arr, chunks=None, compression=None)
    return filepath
def saveSurface(self, dataSet, comment, dataType, surface):
    chunk_size = surface.shape
    # create the group for the new data set
    self.grid.data.create_group('geometeries/surfaces/' + dataSet)
    # save the dataType metadata
    self.grid.data['geometeries/surfaces/' + dataSet].attrs['dataType'] = dataType
    # save all other metadata
    self.grid.data['geometeries/surfaces/' + dataSet].attrs['comment'] = comment
    # save the surface
    da.to_hdf5(self.grid.fname, 'geometeries/surfaces/' + dataSet + '/surface',
               da.from_array(surface, chunks=chunk_size))
def shuffle(self, groupname):
    with h5py.File(self.OUTPATH, mode='r+') as f:
        for atom in self.MAINCHAIN:
            X = da.from_array(f[f'/{atom}/{groupname}/{self.EXPLANATORY_NAME}'])
            Y = da.from_array(f[f'/{atom}/{groupname}/{self.RESPONSE_NAME}'])

            random_order = np.random.permutation(X.shape[0])
            X = da.slicing.shuffle_slice(X, random_order)
            Y = da.slicing.shuffle_slice(Y, random_order)

            da.to_hdf5(self.OUTPATH, f'/{atom}/{groupname}/{self.EXPLANATORY_NAME}', X)
            da.to_hdf5(self.OUTPATH, f'/{atom}/{groupname}/{self.RESPONSE_NAME}', Y)

            print(f'{atom} shuffled.')
def main():
    x0 = int(sys.argv[1]) - 1
    y0 = int(sys.argv[2]) - 1
    z0 = int(sys.argv[3]) - 1
    x1 = int(sys.argv[4])
    y1 = int(sys.argv[5])
    z1 = int(sys.argv[6])

    filepath = '/home/ilknull/Files/Code/HPC-Study-master/Dask/'

    if int(sys.argv[7]) == 1:
        # print("new")
        r = int(sys.argv[4])
        c = int(sys.argv[5])
        h = int(sys.argv[6])
        a = da.ones((r, c, h), dtype=np.int16)
        da.to_hdf5('in.hdf5', {'/a': a})
        np.array((r, c, h), dtype=np.int16).tofile(filepath + 'meta.bin')

    dims = np.fromfile(filepath + 'meta.bin', dtype=np.int16, count=-1)
    r = dims[0]
    c = dims[1]
    h = dims[2]

    fpIn = h5py.File('in.hdf5', mode='r+')

    procArrays = []
    ar = fpIn['/a']
    # print(ar.shape)
    # print('types: ', type(ar), type(ar[0]))
    for z in range(h):
        slic = ar[z]
        slic = delayed(proc)(slic, z, r, c, x0, x1, y0, y1, z0, z1).compute()
        procArrays.append(slic)

    procDask = da.stack(procArrays)
    print(procDask)
    print(procDask.compute())
    # to_hdf5 expects a dict mapping dataset path -> array (the original passed a set)
    da.to_hdf5('out.hdf5', {'/arr': procDask})
def write_h5(uri, path, data):
    import h5py
    import dask.array as da

    # NOTE
    # This failed to work, causing h5py cannot be pickled error.
    # can't pickle local object when doing a to_hdf5 when using dask.distributed
    # https://github.com/dask/distributed/issues/927
    da.to_hdf5(uri, path, data)

    # logger.info(f"uri={uri}, path={path}, data={data.shape},{data.dtype}")
    # with h5py.File(uri, "w") as h5:
    #     dst = h5.create_dataset(path, shape=data.shape, dtype=data.dtype)
    #     store(data, dst)

    return uri
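A hedged sketch of the commented-out alternative above: create the dataset with h5py and write through it with `da.store` on the local threaded scheduler, which can avoid shipping the h5py handle to distributed workers. The function name is hypothetical and this is not presented as the author's final fix.

import h5py
import dask.array as da

def write_h5_local(uri, path, data):
    # Open the file locally and stream the dask array into the dataset
    # without going through da.to_hdf5.
    with h5py.File(uri, "w") as h5:
        dst = h5.create_dataset(path, shape=data.shape, dtype=data.dtype)
        da.store(data, dst, scheduler="threads")  # keep the write on this machine
    return uri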
def assignMprops(self):
    # assign static, figure out dynamic assignment later
    static = True
    if static:
        for units in self.mdata["UNITS"].keys():
            # load up the current grid obj
            print(units)
            vp = da.from_array(self.gridObj.data["/points/vp"],
                               chunks=(self.gridObj.pointx.chunksize))
            vs = da.from_array(self.gridObj.data["/points/vs"],
                               chunks=(self.gridObj.pointx.chunksize))
            p = da.from_array(self.gridObj.data["/points/p"],
                              chunks=(self.gridObj.pointx.chunksize))
            qp = da.from_array(self.gridObj.data["/points/qp"],
                               chunks=(self.gridObj.pointx.chunksize))
            qs = da.from_array(self.gridObj.data["/points/qs"],
                               chunks=(self.gridObj.pointx.chunksize))

            # where is there a valid unit that matches the current unit or one that hasn't been assigned?
            unitAssignments = da.where(
                da.logical_and(self.gridObj.pointunit == int(units), vp == -666),
                True, False)
            # note that unitAssignments is NOT a list of indexes as you would normally expect;
            # it behaves more like a np.where operator, which is why I need to reset it
            # before each assignment
            vp[unitAssignments] = self.mdata["UNITS"][units]["VP"]

            unitAssignments = da.where(
                da.logical_and(self.gridObj.pointunit == int(units), vs == -666),
                True, False)
            vs[unitAssignments] = self.mdata["UNITS"][units]["VS"]

            unitAssignments = da.where(
                da.logical_and(self.gridObj.pointunit == int(units), p == -666),
                True, False)
            p[unitAssignments] = self.mdata["UNITS"][units]["P"]

            unitAssignments = da.where(
                da.logical_and(self.gridObj.pointunit == int(units), qp == -666),
                True, False)
            qp[unitAssignments] = self.mdata["UNITS"][units]["QP"]

            unitAssignments = da.where(
                da.logical_and(self.gridObj.pointunit == int(units), qs == -666),
                True, False)
            qs[unitAssignments] = self.mdata["UNITS"][units]["QS"]

            # save all of the changes to the arrays (for this cycle)
            da.to_hdf5(self.gridObj.fname, "/points/vp", vp)
            da.to_hdf5(self.gridObj.fname, "/points/vs", vs)
            da.to_hdf5(self.gridObj.fname, "/points/p", p)
            da.to_hdf5(self.gridObj.fname, "/points/qp", qp)
            da.to_hdf5(self.gridObj.fname, "/points/qs", qs)
def create_input_chunks_distributed(cs, partition, data_dir, file_format):
    """ For HDF5 only for now.

    cs: chunk shape
    file_format: file format
    data_dir: to store the file
    """
    if not file_format == "HDF5":
        print("File format not supported yet. Aborting...")
        sys.exit(1)

    for i in range(6):
        for filename in os.listdir('/disk' + str(i) + '/gtimothee'):
            if filename.endswith(".json") or filename.endswith(".hdf5"):
                os.remove(os.path.join('/disk' + str(i) + '/gtimothee', filename))

    print(f"Creating input chunks...")
    disk_index = 0
    repartition_dict = dict()
    for i in range(partition[0]):
        for j in range(partition[1]):
            for k in range(partition[2]):
                print(f"Creating random array... shape: {cs}")
                arr = da.random.uniform(size=cs)
                print(f"Done, converting to float16...")
                arr = arr.astype(np.float16)
                out_filename = f'{i}_{j}_{k}.hdf5'
                print(f"Building {out_filename} with shape {cs}")
                data_dirpath = os.path.join('/disk' + str(disk_index), 'gtimothee')
                outfilepath = os.path.join(data_dirpath, out_filename)
                print(f"Storing on {data_dirpath}...")
                da.to_hdf5(outfilepath, '/data', arr, chunks=None, compression=None)

                repartition_dict[str((i, j, k))] = outfilepath

                disk_index += 1
                if disk_index == 6:
                    disk_index = 0

    print(f"Writing repartition file...")
    json_file = os.path.join('/disk0', 'gtimothee', 'repartition_dict.json')
    if os.path.isfile(json_file):
        os.remove(json_file)
    with open(json_file, 'w+') as outfile:
        json.dump(repartition_dict, outfile)
def save_as_hdf5(x, path, progress=False):
    if is_dask(x):
        if progress:
            with ProgressBar():
                da.to_hdf5(path, 'data', x, compression="gzip", compression_opts=9)
        else:
            da.to_hdf5(path, 'data', x, compression="gzip", compression_opts=9)
    else:
        # use a context manager so the file handle is closed after writing
        with h5py.File(path, 'w') as f:
            f.create_dataset('data', data=x, compression="gzip", compression_opts=9)
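An example call (paths are placeholders, not from the source): the same helper handles both dask and in-memory arrays, with an optional progress bar in the dask case; it assumes the `is_dask` predicate used above is available.

import numpy as np
import dask.array as da

# dask array: lazy, chunked write with a progress bar
save_as_hdf5(da.random.random((1000, 1000), chunks=(250, 250)),
             '/tmp/dask_array.h5', progress=True)

# plain numpy array: written directly through h5py
save_as_hdf5(np.random.random((100, 100)), '/tmp/numpy_array.h5')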
def uncompress_to_hdf5():
    print('Writing to hdf5 file after loading raw data in RAM.')
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to hdf5 without compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_uncompressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None)
        print(f'time to save the array to hdf5 without compression: {time.time() - t}')
        visualize([prof, rprof, cprof], out_file_path)

    # write to hdf5 with gzip compression
    out_filepath = 'data/out.hdf5'
    os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_compressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None, compression="gzip")
        print(f'time to save the array to hdf5 with compression: {time.time() - t}')
        visualize([prof, rprof, cprof], out_file_path)
def test_to_hdf5():
    try:
        import h5py
    except ImportError:
        return
    x = da.ones((4, 4), chunks=(2, 2))
    y = da.ones(4, chunks=2, dtype='i4')

    with tmpfile('.hdf5') as fn:
        x.to_hdf5(fn, '/x')
        with h5py.File(fn) as f:
            d = f['/x']
            assert eq(d[:], x)
            assert d.chunks == (2, 2)

    with tmpfile('.hdf5') as fn:
        x.to_hdf5(fn, '/x', chunks=None)
        with h5py.File(fn) as f:
            d = f['/x']
            assert eq(d[:], x)
            assert d.chunks is None

    with tmpfile('.hdf5') as fn:
        x.to_hdf5(fn, '/x', chunks=(1, 1))
        with h5py.File(fn) as f:
            d = f['/x']
            assert eq(d[:], x)
            assert d.chunks == (1, 1)

    with tmpfile('.hdf5') as fn:
        da.to_hdf5(fn, {'/x': x, '/y': y})
        with h5py.File(fn) as f:
            assert eq(f['/x'][:], x)
            assert f['/x'].chunks == (2, 2)
            assert eq(f['/y'][:], y)
            assert f['/y'].chunks == (2,)
def save_arr(arr, storage_type, file_path, key='/data', axis=0, chunks_shape=None,
             compression=None):
    """ Save dask array to hdf5 dataset or numpy file stack.
    """
    if storage_type == "hdf5":
        if chunks_shape:
            print(f'Using chunk shape {chunks_shape}')
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        else:
            if compression == "gzip":
                print('Using gzip compression')
                da.to_hdf5(file_path, key, arr, chunks=None, compression="gzip")
            else:
                print('Without compression')
                da.to_hdf5(file_path, key, arr, chunks=None)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
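A usage sketch for this variant (paths and shapes are placeholders): write the array once as HDF5 with an explicit chunk shape, then again as a numpy stack split along axis 0.

import dask.array as da

arr = da.random.random((400, 400, 400), chunks=(100, 400, 400))

# HDF5 output with explicit physical chunks
save_arr(arr, "hdf5", "/tmp/out.hdf5", key="/data", chunks_shape=(100, 400, 400))

# numpy stack output: one .npy file per block along axis 0, under /tmp/npy/
save_arr(arr, "numpy", "/tmp", axis=0)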
def onthefly_to_hdf5():
    print('Writing to hdf5 file without loading raw data in RAM.')

    # write to hdf5 with gzip compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/write_hdf5.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=(1400, 1400, 350), compression="gzip")
        print(f'time to save the array to hdf5 with compression: {time.time() - t}')
        visualize([prof, rprof, cprof], out_file_path)
#file_out_str = "test%d_numEvent1.hdf5"%label #file_out_str = "%s/%s_IMGall_RH%d_n%d_label%d.hdf5"%(eosDir,decay,int(scale[0]),neff,label) #file_out_str = "%s/%s_IMG_RH%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),neff//1000.) #file_out_str = "%s/%s_IMG_EBEEHBup_RH%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),neff//1000.) #file_out_str = "%s/%s_IMG_RH%d-%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),int(scale[1]),neff//1000.) print " >> Writing to:", file_out_str if os.path.isfile(file_out_str): os.remove(file_out_str) #da.to_hdf5(file_out_str, {'/X_EB': X_EB, 'X_EEm': X_EEm, 'X_EEp': X_EEp, 'X_HBHE': X_HBHE, '/y': y}, compression='lzf') #da.to_hdf5(file_out_str, {'/X': X_EB, 'X_EEm': X_EEm, 'X_EEp': X_EEp, 'X_HBHE': X_HBHE, '/y': y}, compression='lzf') da.to_hdf5(file_out_str, { #'eventId': eventId, #'runId': runId, 'X_ECAL': X_ECAL, #'X_ECAL_EEup': X_ECAL_EEup, 'X_ECAL_stacked': X_ECAL_stacked, 'X_EB': X_EB, 'X_EEm': X_EEm, 'X_EEp': X_EEp, 'X_HBHE': X_HBHE, #'X_HBHE_EM': X_HBHE_EM, 'X_HBHE_EB_up': X_HBHE_EB_up, 'jetSeed_iphi': jetSeed_iphi, 'jetSeed_ieta': jetSeed_ieta, 'X_jets': X_jets, 'jetPt': jetPt, 'jetM': jetM, '/y': y }, compression='lzf') print " >> Done.\n"
    # subtract mean
    axes = tuple(np.arange(b1.ndim, dtype=int)[b1.ndim//2:])
    b1 -= b1.mean(axis=axes, keepdims=True)
    b2 -= b2.mean(axis=axes, keepdims=True)

    # numerator of corrcoef
    numerator = np.multiply(b1, b2).mean(axis=axes, keepdims=False)

    # denominator of corrcoef
    dof = np.prod(b1.shape[slice(axes[0], axes[-1]+1)])
    b1_std = np.sqrt((b1**2).mean(axis=axes, keepdims=False) / dof)
    b2_std = np.sqrt((b2**2).mean(axis=axes, keepdims=False) / dof)
    denominator = np.multiply(b1_std, b2_std)

    # divide
    out = np.divide(numerator, denominator)

    return out


if __name__ == '__main__':
    f1 = h5py.File("test.h5", "r")
    f2 = h5py.File("test2.h5", "r")
    arr1 = da.from_array(f1["arr"])
    arr2 = da.from_array(f2["arr"])

    block_shape = (10, 10)

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, ProgressBar():
        out = da.map_blocks(corrcoef, arr1, arr2, block_shape, chunks=(400, 400))
        da.to_hdf5("out.h5", "/arr", out)

    visualize([prof, rprof])
for j, decay in enumerate(decays):

    #file_in_str = "%s/%s_IMGCROPS_n249k_RH.hdf5"%(eosDir,decay)
    #file_in_str = "%s/%s_IMGCROPS_n249k_pT.hdf5"%(eosDir,decay)
    file_in_str = "%s/%s_n250k_IMG_RHraw.hdf5" % (eosDir, decay)
    dset = h5py.File(file_in_str)
    X_in = da.from_array(dset['/X'], chunks=chunk_shape)
    y_in = da.from_array(dset['/y'], chunks=(chunk_size,))
    assert X_in.shape[0] == y_in.shape[0]
    events = X_in.shape[0]
    #events = 10000
    assert events % chunk_size == 0
    print " >> Doing decay:", decay
    print " >> Input file:", file_in_str
    print " >> Total events:", events

    print " >> Processing..."
    X = da.concatenate([da.from_delayed(\
            process_chunk(X_in[i:i+chunk_size]),\
            shape=(chunk_size,170,360,2),\
            dtype=np.float32)\
            for i in range(0,events,chunk_size)])

    #file_out_str = "%s/%s_IMGCROPS_n249k_pT_RHv%d.hdf5"%(eosDir,decay,ver)
    file_out_str = "%s/%s_n250k_IMG_RHv%d.hdf5" % (eosDir, decay, ver)
    print " >> Writing to:", file_out_str
    da.to_hdf5(file_out_str, {'/X': X, '/y': y_in}, compression='lzf')

    print " >> Done.\n"
branches = ["m0"] m0 = da.concatenate([\ da.from_delayed(\ load_single(tree,i,i+chunk_size, branches),\ shape=(chunk_size,),\ dtype=np.float32)\ for i in range(0,neff,chunk_size)]) print " >> Expected shape:", m0.shape # Class label label = j #label = 1 print " >> Class label:", label y = da.from_array(\ np.full(X.shape[0], label, dtype=np.float32),\ chunks=(chunk_size,)) file_out_str = "%s/%s_FC_n%dk_label%d.hdf5" % (eosDir, decay, neff // 1000., label) #file_out_str = "test.hdf5" print " >> Writing to:", file_out_str da.to_hdf5(file_out_str, { '/X': X, '/y': y, 'eventId': eventId, 'm0': m0 }, compression='lzf') print " >> Done.\n"
def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity, units,
                       pos_dims, spec_dims, main_dset_attrs=None, h5_pos_inds=None,
                       h5_pos_vals=None, h5_spec_inds=None, h5_spec_vals=None,
                       aux_spec_prefix='Spectroscopic_', aux_pos_prefix='Position_',
                       verbose=False, **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should be
    specified using the pos_dims and spec_dims arguments as dictionary objects.
    Alternatively, if both the indices and values datasets are already available
    for either/or the positions / spectroscopic, they can be specified using the
    keyword arguments. In this case, fresh datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape
        for an empty dataset. If creating an empty dataset - the dtype must be
        specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for
        constructing the indices and values datasets.
        Object specifying the instructions necessary for building the Position
        indices and values datasets.
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for
        constructing the indices and values datasets.
        Object specifying the instructions necessary for building the Spectroscopic
        indices and values datasets.
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset.
        Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets. Default = "Spectroscopic"
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = "Position"
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    kwargs will be passed onto the creation of the dataset. Please pass chunking,
    compression, dtype, and other arguments this way.

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset
    """

    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        aux_prefix = validate_single_string_arg(aux_prefix, 'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn('aux_' + dim_type + ' should not contain the "-" character. '
                 'Reformatted name from:{} to {}'.format(aux_prefix, aux_prefix.replace('-', '_')))
            aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                raise KeyError('Dataset named: ' + dset_name + ' already exists in group: '
                               '{}'.format(h5_parent_group.name))
        return aux_prefix

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args(
        [quantity, units, main_data_name],
        ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn('main_data_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError('main_data if specified as a shape should be a list / tuple of integers >= 1')
        if len(main_data) != 2:
            raise ValueError('main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError('dtype must be included as a kwarg when creating an empty dataset')
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError('main_data should either be a numpy array or a tuple / list with the shape of the data')

    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds, h5_pos_vals, main_shape, is_spectroscopic=False)
        if verbose:
            print('Provided h5 position indices and values OK')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix, dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape, pos_dims, is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(h5_parent_group, pos_dims,
                                                       is_spectral=False, verbose=verbose,
                                                       base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds, h5_spec_vals, main_shape, is_spectroscopic=True)
        if verbose:
            print('Provided h5 spectroscopic datasets were OK')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix, dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape, spec_dims, is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(h5_parent_group, spec_dims,
                                                         is_spectral=True, verbose=verbose,
                                                         base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened with the "mpio" communicator. '
                 'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed')

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name, shape=main_data.shape,
                                                 dtype=main_data.dtype, **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.format(h5_main, main_data))
            print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                  .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data, **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    from ..usi_data import USIDataset
    return USIDataset(h5_main)
#file_out_str = "%s/%s_IMG_RH%d_n%dk_label%d.hdf5"%(eosDir,decay,int(scale),neff//1000.,label) #file_out_str = "%s/%s_IMGcropV4_RH%d_n%dkx2_wgt.hdf5"%(eosDir,decay,int(scale),neff//1000.) #file_out_str = "%s/%s_IMG/%s_IMG_RH%d_n%d_%d.hdf5"%(eosDir,decay,decay,int(scale),neff*2,n) file_out_str = "%s_IMG_RH%d_n%d_%d.hdf5" % (decay, int(scale), neff * 2, n) #file_out_str = "test.hdf5" print " >> Writing to:", file_out_str #da.to_hdf5(file_out_str, {'/X': X, '/y': y, 'eventId': eventId, 'X_crop0': X_crop0, 'X_crop1': X_crop1}, compression='lzf') da.to_hdf5( file_out_str, { '/X': X, '/y': y, #'eventId': eventId, 'X_crop0': X_crop0, 'X_crop_stack0': X_crop_stack0, #'X_crop1': X_crop1 #'X_p4': X_p4 'y_mass': y_mass0, 'y_pT': y_pT0, 'y_DR': y_DR0, #'pho_pT0': pho_pT0, #'pho_E0': pho_E0, #'pho_eta0': pho_eta0 #'wgt': wgt }, compression='lzf') print " >> Done.\n"
        for i in range(0,neff,chunk_size)])
print " >> Expected shape:", X_HBHE.shape

# Class label
label = j
#label = 1
print " >> Class label:", label
y = da.from_array(\
        np.full(X_EB.shape[0], label, dtype=np.float32),\
        chunks=(chunk_size,))

#file_out_str = "%s/%s_IMG_RHraw_n%dk.hdf5"%(eosDir,decay,neff//1000.)
#file_out_str = "%s/%s_IMG_RHv1_n%dk.hdf5"%(eosDir,decay,neff//1000.)
#file_out_str = "%s/%s_IMG_RH%d_n%dk.hdf5"%(eosDir,decay,int(rescaler),neff//1000.)
#file_out_str = "%s/%s_IMG_RH%d-%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),int(scale[1]),neff//1000.)
file_out_str = "%s/TEST_%s_IMG_RH%d-%d_n%dk.hdf5" % (
    eosDir, decay, int(scale[0]), int(scale[1]), neff // 1000.)
print " >> Writing to:", file_out_str
#da.to_hdf5(file_out_str, {'/X': X, '/y': y}, chunks=(chunk_size,s,s,2), compression='lzf')
#da.to_hdf5(file_out_str, {'/X_EB': X_EB, 'X_EEm': X_EEm, 'X_EEp': X_EEp, '/y': y}, compression='lzf')
da.to_hdf5(file_out_str, {
    '/X_EB': X_EB,
    'X_EEm': X_EEm,
    'X_EEp': X_EEp,
    'X_HBHE': X_HBHE,
    '/y': y
    }, compression='lzf')
print " >> Done.\n"
for seed in xrange(1, 6):
    idx_under = np.empty((0,), dtype=int)
    random_state = check_random_state(None)

    for target_class in np.unique(y_label):
        if target_class == 7:
            n_samples = 10000
            index_class = random_state.choice(range(np.count_nonzero(y_label == target_class)),
                                              size=n_samples)
        else:
            index_class = slice(None)

        idx_under = np.concatenate(
            (idx_under, np.flatnonzero(y_label == target_class)[index_class]), axis=0)

    sub_X = safe_indexing(trainSet_X, idx_under)
    sub_y = safe_indexing(trainSet_y, idx_under)
    sub_label = safe_indexing(y_label, idx_under)
    print('sub counter')
    print(sorted(Counter(sub_label).items()))

    trainSet_X_seed = da.from_array(sub_X, chunks=128)
    trainSet_y_seed = da.from_array(sub_y, chunks=128)
    da.to_hdf5('train_LRCL_informed_window' + str(SLIDING_WINDOW_LENGTH) +
               '_seed' + str(seed) + '.h5',
               {'data': trainSet_X_seed, 'label': trainSet_y_seed})
jetEventId = jetEventId_[runMask][:nJets]
print " >> %s: %s"%('jetEventId', jetEventId.shape)
jetRunId = jetRunId_[runMask][:nJets]
print " >> %s: %s"%('jetRunId', jetRunId.shape)
X_ECAL_stacked = X_ECAL_stacked_[runMask][:nJets]
print " >> %s: %s"%('X_ECAL_stacked', X_ECAL_stacked.shape)
y_jets = y_jets_[runMask][:nJets]
print " >> %s: %s"%('y_jets', y_jets.shape)

#file_out_str = "test_jets.hdf5"
file_out_str = "%s/%s/%s_n%d_label%d_run%d.hdf5"%(eosDir, decay, decay, nJets, label, i)
#file_out_str = "%s/%s/%s_n%d_label%d_jet%d_run%d.hdf5"%(eosDir, decay, decay, nJets, label, ijet, i)
print " >> Writing to:", file_out_str
da.to_hdf5(file_out_str, {
    #'runId': runId,
    #'lumiId': lumiId,
    #'eventId': eventId,
    'X_ECAL_stacked': X_ECAL_stacked,
    #'y': y,
    'jetRunId': jetRunId,
    'jetEventId': jetEventId,
    #'jetSeed_iphi': jetSeed_iphi,
    #'jetSeed_ieta': jetSeed_ieta,
    #'jetM': jetM,
    #'jetPt': jetPt,
    #'X_jets': X_jets,
    'y_jets': y_jets
    }, compression='lzf')
print " >> Done.\n"
import h5py
from glob import glob
import os

filenames = sorted(glob(os.path.join('data', 'weather-big', '*.hdf5')))
dsets = [h5py.File(filename, mode='r')['/t2m'] for filename in filenames]

import dask.array as da

arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]
x = da.stack(arrays, axis=0)

result = x[:, ::2, ::2]

da.to_hdf5(os.path.join('data', 'myfile.hdf5'), '/output', result)
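An optional check (a sketch, assuming the write above succeeded and data/myfile.hdf5 now exists): reopen the output file and confirm the stored dataset matches the downsampled stack.

import os
import h5py

with h5py.File(os.path.join('data', 'myfile.hdf5'), mode='r') as f:
    dset = f['/output']
    print(dset.shape, dset.dtype)  # should match result.shape and result.dtype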