def gather_videos_vqp(fd: h5py.File):
    """Specified for VQP"""
    root = Path(FLAGS.input_dir)
    glob = FLAGS.glob or '*'
    inputs = sorted(root.glob(glob))
    candidates = set(i.parent for i in filter(lambda f: f.is_file(), inputs))

    frames_info = {}
    for p in tqdm.tqdm(candidates):
        seq = [
            Image.open(f)
            for f in filter(lambda f: f.is_file(), sorted(p.rglob('*')))
        ]
        cube = np.stack(seq)
        if FLAGS.data_format == 'channels_first':
            cube = cube.transpose([0, 3, 1, 2])
        cube = np.expand_dims(cube, 0)

        path = p.relative_to(root)
        # ugly
        path = path.parent / path.stem.split('_')[0]
        key = str(path.as_posix())

        if key not in fd:
            fd.create_dataset(key,
                              data=cube,
                              maxshape=(52, ) + cube.shape[1:],
                              compression=FLAGS.compression)
            frames_info[key] = len(seq)
        else:
            d = fd[key]
            cnt = d.shape[0] + 1
            d.resize(cnt, 0)
            d[-1] = cube
        del cube
class TestSimpleSlicing(TestCase):
    """
    Feature: Simple NumPy-style slices (start:stop:step) are supported.
    """

    def setUp(self):
        self.f = File(self.mktemp(), 'w')
        self.arr = np.arange(10)
        self.dset = self.f.create_dataset('x', data=self.arr)

    def tearDown(self):
        if self.f:
            self.f.close()

    def test_negative_stop(self):
        """ Negative stop indexes work as they do in NumPy """
        self.assertArrayEqual(self.dset[2:-2], self.arr[2:-2])

    def test_write(self):
        """Assigning to a 1D slice of a 2D dataset"""
        dset = self.f.create_dataset('x2', (10, 2))
        x = np.zeros((10, 1))
        dset[:, 0] = x[:, 0]
        with self.assertRaises(TypeError):
            dset[:, 1] = x
def test_create_with_space_strategy(self):
    """ Create file with file space strategy """
    fname = self.mktemp()
    fid = File(fname, 'w', fs_strategy="page", fs_persist=True, fs_threshold=100)
    self.assertTrue(fid)

    # Unable to set file space strategy of an existing file
    with self.assertRaises(ValueError):
        File(fname, 'a', fs_strategy="page")
    # Invalid file space strategy type
    with self.assertRaises(ValueError):
        File(self.mktemp(), 'w', fs_strategy="invalid")

    dset = fid.create_dataset('foo', (100,), dtype='uint8')
    dset[...] = 1
    dset = fid.create_dataset('bar', (100,), dtype='uint8')
    dset[...] = 1
    del fid['foo']
    fid.close()

    fid = File(fname, 'a')
    plist = fid.id.get_create_plist()
    fs_strat = plist.get_file_space_strategy()
    assert fs_strat[0] == 1
    assert fs_strat[1] == True
    assert fs_strat[2] == 100
    dset = fid.create_dataset('foo2', (100,), dtype='uint8')
    dset[...] = 1
    fid.close()
def unite(batches, out_dir, out_name, batch_size):
    num_batches = len(batches)
    file = File(f'{out_dir}/{out_name}.h5', 'w')
    x_data = file.create_dataset('x_data',
                                 shape=(batch_size * num_batches, 299, 299, 3),
                                 dtype=np.uint8)
    x_data_adv = file.create_dataset('x_data_adv',
                                     shape=(batch_size * num_batches, 299, 299, 3),
                                     dtype=np.uint8)
    y_data = file.create_dataset('y_data',
                                 shape=(batch_size * num_batches, 1000),
                                 dtype=np.float32)
    for i, batch in enumerate(batches):
        batch = File(batch, 'r')
        batch_x, batch_x_adv, batch_y = map(lambda x: batch[x],
                                            ('x_data', 'x_data_adv', 'y_data'))
        offset = i * batch_size
        x_data[offset:offset + batch_size] = batch_x
        x_data_adv[offset:offset + batch_size] = batch_x_adv
        y_data[offset:offset + batch_size] = batch_y
        batch.close()
        logger.info(f'{i+1}/{num_batches} batches for {out_name} repacked')
    file.close()
def add_vanhateren_subset(h5handle: h5py.File, prefix, imglist, imgbase):
    length_filename = len(imglist[0])
    for imgname in imglist:
        assert len(imgname) == length_filename
    imglist_dtype = 'S' + str(length_filename)

    imgarray_all = []
    for idx, imgname in enumerate(imglist):
        with open(os.path.join(imgbase, imgname), 'rb') as f:
            imgarray = np.fromfile(f, dtype='>u2').reshape((1024, 1536))
        assert imgarray.size == 1024 * 1536
        imgarray = imgarray[:, 2:-2]  # remove black stuff.
        imgarray_all.append(imgarray)
        if idx % 100 == 0:
            print(idx)

    data_to_write = np.asarray(imgarray_all)
    print(data_to_write.shape, data_to_write.dtype)
    h5handle.create_dataset(
        prefix + '_data',
        data=data_to_write,
        chunks=(1, 256, 1532),  # basically, make each chunk 1/4 of an image.
        compression='gzip',
        shuffle=True,
        fletcher32=True)
    h5handle.create_dataset(prefix + '_filelist',
                            data=np.array(imglist, dtype=imglist_dtype))
def test_create_with_space_strategy(self):
    """ Create file with file space strategy """
    fname = self.mktemp()
    fid = File(fname, 'w', fs_strategy="page", fs_persist=True, fs_threshold=100)
    self.assertTrue(fid)

    dset = fid.create_dataset('foo', (100,), dtype='uint8')
    dset[...] = 1
    dset = fid.create_dataset('bar', (100,), dtype='uint8')
    dset[...] = 1
    del fid['foo']
    fid.close()

    fid = File(fname, 'a')
    plist = fid.id.get_create_plist()
    fs_strat = plist.get_file_space_strategy()
    assert fs_strat[0] == 1
    assert fs_strat[1] == True
    assert fs_strat[2] == 100
    dset = fid.create_dataset('foo2', (100,), dtype='uint8')
    dset[...] = 1
    fid.close()
def make_h5py_file(fp, num_entries, classes):
    f = File(fp, 'w')
    f.attrs['classes'] = np.array([x.encode() for x in classes])
    byte_type = special_dtype(vlen=np.dtype('uint8'))
    images_dset = f.create_dataset("images", (num_entries,), dtype=byte_type,
                                   maxshape=(None,))
    masks_dset = f.create_dataset("masks", (num_entries,), dtype=byte_type,
                                  maxshape=(None,))
    return images_dset, masks_dset, f
def test_visit(self):
    fname = self.mktemp()
    fid = File(fname, 'w')
    fid.create_dataset('foo', (100,), dtype='uint8')
    with pytest.raises(TestException, match='throwing exception'):
        fid.visititems(throwing)
    fid.close()
def write_contact_map(
    h5_file: h5py.File,
    rows: List[np.ndarray],
    cols: List[np.ndarray],
    vals: Optional[List[np.ndarray]] = None,
):
    # Helper function to create ragged array
    def ragged(data):
        a = np.empty(len(data), dtype=object)
        a[...] = data
        return a

    # list of np arrays of shape (2 * X) where X varies
    data = ragged([np.concatenate(row_col) for row_col in zip(rows, cols)])
    h5_file.create_dataset(
        "contact_map",
        data=data,
        dtype=h5py.vlen_dtype(np.dtype("int16")),
        fletcher32=True,
        chunks=(1,) + data.shape[1:],
    )

    # Write optional values field for contact map. Could contain CA-distances.
    if vals is not None:
        data = ragged(vals)
        h5_file.create_dataset(
            "contact_map_values",
            data=data,
            dtype=h5py.vlen_dtype(np.dtype("float32")),
            fletcher32=True,
            chunks=(1,) + data.shape[1:],
        )
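# A minimal, hypothetical usage sketch for write_contact_map above. The output
# file name and the toy row/column index arrays are made up for illustration;
# only the function itself comes from the original snippet.
import h5py
import numpy as np

rows = [np.array([0, 1, 2], dtype=np.int16), np.array([0, 3], dtype=np.int16)]
cols = [np.array([1, 2, 0], dtype=np.int16), np.array([3, 0], dtype=np.int16)]
with h5py.File("contact_maps.h5", "w") as f:
    # Each entry of "contact_map" becomes one variable-length int16 array holding
    # the concatenated (row, col) indices for that frame.
    write_contact_map(f, rows, cols)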
def save_optimizer_state_dict(hf: h5py.File, state_dict: Optional[bytes]) -> None:
    if state_dict is None:
        return
    hf.create_dataset(
        OPTIMIZER_STATE_DICT_DATASET,
        data=np.frombuffer(state_dict, dtype=NP_VOID_DTYPE),
    )
def main():
    a = DetPulseCoord()
    fileid = h5f.create(b"test.h5")
    x = [1, 3, 3]
    y = [1., 3., 3, 4., 5, 3., 33.]
    x = ones((100, 3), dtype=int32)
    y = ones((100, 7), dtype=float32)
    z = ones((100, 2), dtype=float32)
    c = [(x[i], y[i], z[i]) for i in range(100)]
    data = {a.names[0]: x, a.names[1]: y}
    dspaceid = h5s.create_simple((1, ), (h5s.UNLIMITED, ))
    # dset = h5d.create(fileid, a.name, a.type, dspaceid)
    # dset.write()
    file = File("test.h5")
    numpytype = dtype([("coord", int32, (3, )), ("pulse", float32, (7, )),
                       ("EZ", float32, (2, ))])
    data = array(c, dtype=numpytype)
    tid = h5t.C_S1.copy()
    tid.set_size(6)
    H5T6 = Datatype(tid)
    tid.set_size(4)
    H5T_C_S1_4 = Datatype(tid)
    file.create_dataset("DetPulseCoord", data=data)
    file.attrs.create("CLASS", "TABLE", dtype=H5T6)
    file.attrs.create("FIELD_0_NAME", a.names[0])
    file.attrs.create("FIELD_1_NAME", a.names[1])
    file.attrs.create("TITLE", "Detpulse coord pair data")
    file.attrs.create("VERSION", "3.0", dtype=H5T_C_S1_4)
    file.attrs.create("abstime", 1.45e9, dtype=float64, shape=(1, ))
    file.attrs.create("nevents", 122421, dtype=float64, shape=(1, ))
    file.attrs.create("runtime", 125000, dtype=float64, shape=(1, ))
    file.flush()
def create_datasets_and_return(f: h5py.File):
    # https://docs.h5py.org/en/stable/high/dataset.html
    # RGB: 720 x 1280 x 4 color channels x num images
    timestampset = f.create_dataset(
        name="timestamp", shape=(1, 1), maxshape=(None, 1), dtype="int64"
    )
    colorset = f.create_dataset(
        name="color",
        shape=(1, RGBD_X, RGBD_Y, COLOR_Z),
        maxshape=(None, RGBD_X, RGBD_Y, COLOR_Z),
        compression="gzip",
        # Set this accordingly to trade off FPS and filesize
        compression_opts=0,
        dtype="uint8",
    )
    depthset = f.create_dataset(
        name="depth",
        shape=(1, RGBD_X, RGBD_Y),
        maxshape=(None, RGBD_X, RGBD_Y),
        compression="gzip",
        # compression_opts=9,
        dtype="uint16",
    )
    return colorset, depthset, timestampset
def _preprocess_split(self, h5py_file: h5py.File, split_name: str,
                      raw_split_resource: StreamedResource) -> StreamedResource:
    split_df = pd.read_csv(raw_split_resource, delimiter=",")
    targets_df = split_df["intent"]
    sample_texts_df = split_df["tokens"]
    embedder_instances = [WordEmbeddings("en-glove")]
    doc_embedders = DocumentPoolEmbeddings(embedder_instances)
    sample_location = os.path.join(split_name, "samples")
    target_location = os.path.join(split_name, "targets")
    embedding_size = sum(
        [embedder.embedding_length for embedder in embedder_instances])
    string_datatype = h5py.string_dtype(encoding='ascii')
    sample_dset = h5py_file.create_dataset(sample_location,
                                           shape=(len(split_df), embedding_size))
    target_dset = h5py_file.create_dataset(target_location, (len(split_df), ),
                                           dtype=string_datatype)
    for i in tqdm.tqdm(range(len(split_df)),
                       desc=f"Embedding {split_name} split"):
        doc_text = AtisPreprocessor._clean_text(sample_texts_df.iloc[i])
        embedded_doc = self._embed_document(doc_text, doc_embedders)
        sample_dset[i] = embedded_doc
        target_dset[i] = targets_df.iloc[i]
def write_point_cloud(h5_file: h5py.File, point_cloud: np.ndarray):
    h5_file.create_dataset(
        "point_cloud",
        data=point_cloud,
        dtype="float32",
        fletcher32=True,
        chunks=(1,) + point_cloud.shape[1:],
    )
def hdf5_writer(filename, data, components=None):
    """
    Write a dataset or a subset to an HDF5 file.

    Parameters
    ----------
    data : `~glue.core.data.Data` or `~glue.core.subset.Subset`
        The data or subset to export
    components : `list` or `None`
        The components to export. Set this to `None` to export all components.
    """

    if isinstance(data, Subset):
        mask = data.to_mask()
        data = data.data
    else:
        mask = None

    from h5py import File

    f = File(filename, 'w')

    for cid in data.main_components + data.derived_components:

        if components is not None and cid not in components:
            continue

        if data.get_kind(cid) == 'categorical':
            values = data[cid]
            if values.dtype.kind == 'U':
                values = np.char.encode(values, encoding='ascii',
                                        errors='replace')
            else:
                values = values.copy()
        else:
            values = data[cid].copy()

        if mask is not None:
            if values.ndim == 1:
                values = values[mask]
            else:
                if values.dtype.kind == 'f':
                    values[~mask] = np.nan
                elif values.dtype.kind == 'i':
                    values[~mask] = 0
                elif values.dtype.kind == 'S':
                    values[~mask] = ''
                else:
                    warnings.warn(
                        "Unknown data type in HDF5 export: {0}".format(
                            values.dtype))
                    continue

        f.create_dataset(cid.label, data=values)

    f.close()
def save_in(store: h5py.File, layer_outputs: List[np.ndarray],
            metrics: List[Metric], references: pd.DataFrame):
    """ Save batch data into HDF5 file. """
    for index, metric in enumerate(metrics):
        sample_id = len(references)
        references.loc[sample_id] = metric
        for output_index, batch_layer_outputs in enumerate(layer_outputs):
            layer_output = batch_layer_outputs[index]
            store.create_dataset(f'outputs/{output_index}/{sample_id}',
                                 data=layer_output)
def to_hdf5(self, hdf5: h5py.File) -> None:
    """Write this object's attributes and datasets to an open HDF5 file."""
    hdf5.attrs['band'] = self.band
    if self.antenna is not None:
        hdf5.attrs['antenna'] = self.antenna
    if self.receiver is not None:
        hdf5.attrs['receiver'] = self.receiver
    hdf5.attrs['correlator_efficiency'] = self.correlator_efficiency
    hdf5.create_dataset('frequency', data=self.frequency, track_times=False)
    hdf5.create_dataset('coefs', data=self.coefs, track_times=False)
def h5_writer(data, h5_path):
    from h5py import File
    from os import remove
    from os.path import exists
    if exists(h5_path):
        remove(h5_path)
    f = File(h5_path, 'w')
    f.create_dataset('default', data=data, compression='gzip', chunks=True,
                     shuffle=True)
    f.close()
def save_optimizer_state_dict(
    hf: h5py.File,
    state_dict: Optional[OptimizerStateDict],
) -> None:
    if state_dict is None:
        return
    with io.BytesIO() as fobj:
        torch.save(state_dict, fobj)
        hf.create_dataset(OPTIMIZER_STATE_DICT_DATASET,
                          data=np.frombuffer(fobj.getbuffer(),
                                             dtype=NP_VOID_DTYPE))
def hdf5_writer(filename, data, components=None):
    """
    Write a dataset or a subset to an HDF5 file.

    Parameters
    ----------
    data : `~glue.core.data.Data` or `~glue.core.subset.Subset`
        The data or subset to export
    components : `list` or `None`
        The components to export. Set this to `None` to export all components.
    """

    if isinstance(data, Subset):
        mask = data.to_mask()
        data = data.data
    else:
        mask = None

    from h5py import File

    f = File(filename, 'w')

    for cid in data.visible_components:

        if components is not None and cid not in components:
            continue

        comp = data.get_component(cid)
        if comp.categorical:
            if comp.labels.dtype.kind == 'U':
                values = np.char.encode(comp.labels, encoding='ascii',
                                        errors='replace')
            else:
                values = comp.labels.copy()
        else:
            values = comp.data.copy()

        if mask is not None:
            if values.ndim == 1:
                values = values[mask]
            else:
                if values.dtype.kind == 'f':
                    values[~mask] = np.nan
                elif values.dtype.kind == 'i':
                    values[~mask] = 0
                elif values.dtype.kind == 'S':
                    values[~mask] = ''
                else:
                    warnings.warn("Unknown data type in HDF5 export: "
                                  "{0}".format(values.dtype))
                    continue

        f.create_dataset(cid.label, data=values)

    f.close()
def _save_as_hdf5_rec(cls, obj: Mapping[str, Union[Mapping, np.ndarray]],
                      root: h5py.File):
    for k, v in obj.items():
        if isinstance(v, np.ndarray):
            root.create_dataset(name=k, data=v)
        elif isinstance(v, dict):
            grp = root.create_group(name=k)
            cls._save_as_hdf5_rec(v, grp)
        elif isinstance(v, Number):
            root.create_dataset(name=k, data=v)
        else:
            raise ValueError(f'Does not support type {type(v)}')
def _save_values(self, file: h5py.File) -> None:
    """Save values needed to reproduce fit

    Args:
        file (h5py.File): Opened file to save to
    """
    file.create_dataset(
        self.stain_matrix_key,
        data=self.stain_matrix_target,
        compression="gzip",
        compression_opts=9,
    )
def write_datasubset(
    infile: h5py.File,
    outfile: h5py.File,
    mask: SWIFTMask,
    dataset_names: List[str],
    links_list: List[str],
):
    """
    Writes subset of all datasets contained in snapshot according to specified mask

    Parameters
    ----------
    infile : h5py.File
        hdf5 file handle for input snapshot
    outfile : h5py.File
        hdf5 file handle for output snapshot
    mask : SWIFTMask
        the mask used to define subset that is written to new snapshot
    dataset_names : list of str
        names of datasets found in the snapshot
    links_list : list of str
        names of links found in the snapshot
    """
    skip_list = links_list.copy()
    skip_list.extend(["Cells", "SubgridScheme"])

    if mask is not None:
        for name in dataset_names:
            if any([substr for substr in skip_list if substr in name]):
                continue

            # get output dtype and size
            first_value = infile[name][0]
            output_type = first_value.dtype
            output_size = first_value.size
            mask_size = get_dataset_mask(mask, name, suffix="_size")
            if output_size != 1:
                output_shape = (mask_size, output_size)
            else:
                output_shape = mask_size

            dataset_mask = get_dataset_mask(mask, name)
            subset = read_ranges_from_file(
                infile[name],
                dataset_mask,
                output_shape=output_shape,
                output_type=output_type,
            )

            # Write the subset
            outfile.create_dataset(name, data=subset)
            for attr_name, attr_value in infile[name].attrs.items():
                outfile[name].attrs.create(attr_name, attr_value)
def _setup_hdf5(self, h5_file: h5py.File):
    """Sets up an HDF5 file to work as a database.

    Parameters
    ----------
    h5_file
        HDF5 file to set up. Must be opened in write mode.
    """
    if self.label_dtype is None:
        self.label_dtype = self._default_label_dtype
    if self.feature_dtype is None:
        self.feature_dtype = self._default_feature_dtype
    h5_file.create_dataset('features', shape=(0, 0),
                           dtype=self.feature_dtype, maxshape=(None, None))
    h5_file.create_dataset('labels', shape=(0, 0, 0),
                           dtype=self.label_dtype, maxshape=(None, None, None))
    h5_file.create_dataset('instance_ids', shape=(0, ), dtype=int,
                           maxshape=(None, ))
    h5_file.create_dataset('labeller_ids', shape=(0, ), dtype=int,
                           maxshape=(None, ))
    h5_file.attrs['label_dtype'] = self.label_dtype
    h5_file.attrs['feature_dtype'] = self.feature_dtype
    h5_file.attrs['n_features'] = -1
    h5_file.attrs['label_dim'] = -1
def compress_and_store(
    hd5: h5py.File,
    data: np.ndarray,
    hd5_path: str,
):
    """Support function that takes arbitrary input data in the form of a NumPy
    array and compresses, stores, and checksums the data in an HDF5 file.

    Args:
        hd5 (h5py.File): Target HDF5-file handle.
        data (np.ndarray): Data to be compressed and saved.
        hd5_path (str): HDF5 dataset path for the stored data.
    """
    data = data.copy(order='C')  # Required for xxhash
    compressed_data = blosc.compress(data.tobytes(), typesize=2,
                                     cname='zstd', clevel=9)
    hash_uncompressed = xxhash.xxh128_digest(data)
    hash_compressed = xxhash.xxh128_digest(compressed_data)
    decompressed = np.frombuffer(blosc.decompress(compressed_data),
                                 dtype=np.uint16).reshape(data.shape)
    assert xxhash.xxh128_digest(decompressed) == hash_uncompressed

    dset = hd5.create_dataset(hd5_path, data=np.void(compressed_data))
    # Store meta data:
    # 1) Shape of the original tensor
    # 2) Hash of the compressed data
    # 3) Hash of the uncompressed data
    dset.attrs['shape'] = data.shape
    dset.attrs['hash_compressed'] = np.void(hash_compressed)
    dset.attrs['hash_uncompressed'] = np.void(hash_uncompressed)
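# Hedged companion sketch (not part of the original source): one plausible way to
# read back a dataset written by compress_and_store above, assuming the same
# blosc/xxhash libraries and the uint16 dtype implied by its verification step.
import blosc
import h5py
import numpy as np
import xxhash


def load_compressed(hd5: h5py.File, hd5_path: str) -> np.ndarray:
    """Decompress a blosc-packed dataset and verify it against the stored hash."""
    dset = hd5[hd5_path]
    raw = blosc.decompress(dset[()].tobytes())
    data = np.frombuffer(raw, dtype=np.uint16).reshape(tuple(dset.attrs['shape']))
    # The uncompressed hash was stored as an opaque np.void attribute.
    assert xxhash.xxh128_digest(data) == dset.attrs['hash_uncompressed'].tobytes()
    return data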
def serialize_samples(self, writer: h5py.File, data_file: str, label_file: str):
    frames, labels, seq_num, num_samples, names = self._get_samples(
        data_file, label_file)
    # store data
    writer.create_dataset('audio', data=frames)
    writer.create_dataset('labels', data=labels)
    # Save meta-data
    writer.attrs['data_file'] = str(data_file)
    writer.attrs['label_file'] = str(label_file)
    writer.attrs['seq_num'] = seq_num
    writer.attrs['num_samples'] = num_samples
    # writer.attrs['label_names'] = names[1:]
    writer.attrs['label_names'] = names
def calculate_distances(name):
    index_ids, index_vectors = get_data('index', name)
    test_ids, test_vectors = get_data('test', name)
    logger.info('data is read')
    index_vectors, test_vectors = map(arr, (index_vectors, test_vectors))
    logger.info('tensors are ready')

    shape = len(test_ids), len(index_ids)
    file = File('data/distances.h5', 'w')
    result = file.create_dataset('result', shape=shape, dtype=np.uint8)
    logger.info('h5 file is ready')

    index_vectors = index_vectors.view(-1, SHAPE).cuda()
    for i in tqdm(np.arange(shape[0]), desc='calculating cosine'):
        c = cosine(test_vectors[i].view(-1, SHAPE), index_vectors)
        result[i, :] = c

    for i, v in tqdm(zip(index_ids, index_vectors), desc='removing empty pics'):
        if v is None:
            result[:, i] = 255
    file.close()
def _set_outputs(self, output_file: h5py.File, outputs: Union[Tuple, Any]) -> None:
    """Save the step output to a given h5 file

    Args:
        output_file (h5py.File): File to write to
        outputs (Union[Tuple, Any]): Computed step output
    """
    if not isinstance(outputs, tuple):
        outputs = tuple([outputs])
    for i, output in enumerate(outputs):
        output_file.create_dataset(
            f"{self.output_key}_{i}",
            data=output,
            compression="gzip",
            compression_opts=9,
        )
def write_in_hd5_ukbb(
    name: str,
    storage_type: StorageType,
    value: np.ndarray,
    hd5: h5py.File,
    compression: str,
):
    """Replicates storage behavior in tensor_writer_ukbb"""
    if storage_type == StorageType.STRING:
        hd5.create_dataset(name, data=value, dtype=h5py.special_dtype(vlen=str))
    elif storage_type == StorageType.CONTINUOUS:
        hd5.create_dataset(name, data=value, compression=compression)
    else:
        raise NotImplementedError(
            f'{storage_type} cannot be automatically written yet')
def compile_ace_h5(wav_loc, saveloc, ft='.wav', all_single_channel=False):
    """
    Create an HDF5 dataset which contains information about a set of files which
    describe AIRs of acoustic environments. This file can be used to train DNNs
    using ace_discriminative_nets.py

    Args:
        wav_loc: The location of the wav files as a list
        saveloc: The location to save the HDF5 file
        ft: The file type to look for
        all_single_channel: Assume that all responses are single channel
            (faster and does not require soxi)

    Returns:
        Nothing
    """
    from utils_base import find_all_ft, run_command
    try:
        from os.path import abspath
    except ImportError:
        raise
    from h5py import File

    all_wavs = find_all_ft(wav_loc, ft=ft, use_find=True)
    channels = []
    for i in range(len(all_wavs)):
        print('Reading : ' + all_wavs[i])
        all_wavs[i] = abspath(all_wavs[i])
        if all_single_channel:
            channels.append('1')
        else:
            try:
                channels.append(run_command('soxi -c ' + all_wavs[i])[0])
            except OSError as ME:
                print('It looks like soxi is not installed; getting the number '
                      'of channels failed with: ' + str(ME))
                raise
    hf = File(saveloc, 'w')
    hf.create_dataset('filenames', data=all_wavs)
    hf.create_dataset('chan', data=channels)
    hf.close()
    print('Done with : ' + saveloc)
def overwrite_or_create(file: h5py.File, data: np.ndarray, key: str):
    """
    Check if node exists in hdf5 file. If it does exist, overwrite it with the
    given array; otherwise create a new dataset.

    Parameters
    ----------
    file: h5py File object
    data: Numpy Array
    key: str

    Returns
    -------
    None
    """
    if key in file:
        del file[key]
    file.create_dataset(key, data=data)
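# A small, hypothetical usage example for overwrite_or_create above; the file and
# key names are made up for illustration and are not from the original source.
import h5py
import numpy as np

with h5py.File("results.h5", "a") as f:
    overwrite_or_create(f, np.arange(10), "scores")      # creates "scores"
    overwrite_or_create(f, np.zeros((5, 5)), "scores")   # replaces it; a new shape is fine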
def _gen_histogram(fd: h5py.File, bins: int):
    # Alright, this has been frustrating:
    # (I) I cannot use a locally scoped queue as pickle
    # is not able to pickle the locally bound function (why would
    # it even do that? It only has to pickle the return value).
    # (II) I cannot use a manager, because pytorch fails miserably
    # as for some reason it thinks it has to reinitialize
    # CUDA even though no tensors are ever used in the forked
    # process.
    # (III) I cannot switch the process start_method to 'spawn' because
    # it was already initialized by modules imported by this module.
    # Also: changing some global state which has side effect even
    # regarding other modules is probably a _very_ bad idea.
    # (IV) When using pathos as a replacement for multiprocessing
    # it spewed a whole new class of different errors and for the moment
    # I simply give that up and use global variables.
    # I hate pickle so much.
    minimum = fd['dists'].attrs['minimum']
    maximum = fd['dists'].attrs['maximum']
    rg = int(minimum - 1), int(maximum + 1)
    print('creating histogram in range [{}, {}]'.format(*rg))

    pool = mp.Pool()
    proc = mp.Process(target=_buffer_stats)
    proc.start()

    chunk_size = BUF_SIZE
    print()

    # it is not using map or async_map to be able to slow
    # down the reader on systems with little RAM
    for chunk in tqdm(range(fd['dists'].shape[0] // chunk_size)):
        a = chunk * chunk_size
        b = a + chunk_size
        dists = fd['dists'][a:b]
        pool.apply_async(_calc_stats, (dists, bins, rg))

    print('\n', 'waiting for workers to finish')
    pool.close()  # harbl
    pool.join()

    print('awaiting result')
    # feed some cyanide
    _shitq.put(None)
    proc.join()
    stats = _shitq.get()

    ds_hist = fd.create_dataset('histogram', stats.histogram.shape)
    ds_hist[:] = stats.histogram
    ds_hist.attrs['bin_edges'] = stats.bin_edges
    print('finished creating histogram')
def writeData(data, outputFilename):
    """
    Writes data to a tiff, hdf5, or npy file.

    Parameters
    ----------
    data : 3D numpy array
        The data to be written. Must have 3 dimensions, i.e. data.ndim == 3
    outputFilename : string
        The absolute or relative location of the particular file to be written.
        outputFilename must end in one of the following extensions
        ['.tif', '.tiff', '.hdf5', '.h5', '.npy'].

    Notes
    -----
    - Data to be saved must be a 3D array.
    """
    assert data.ndim == 3, "Can only write out 3D hdf5, tiff, and numpy files"

    filename = outputFilename.rstrip('/')
    basePath, fName = os.path.split(filename)
    name, ext = os.path.splitext(fName)

    if basePath and not os.path.exists(basePath):
        raise IOError("Directory does not exist: %s" % basePath)

    if ext.lower() in ['.npy']:
        try:
            np.save(filename, np.array(data, dtype=np.float32))
        except IOError:
            raise IOError("Error writing npy data to: \"%s\"" % filename)
    elif ext.lower() in ['.h5', '.hdf5']:
        from h5py import File
        try:
            h5File = File(filename, "w")
        except IOError:
            raise IOError("Error creating writable hdf5 file at: \"%s\"" % filename)
        shp = data.shape
        comp = "gzip"
        compOpts = 1
        dset = h5File.create_dataset("/raw", shp, np.float32, data, chunks=shp,
                                     compression=comp, compression_opts=compOpts)
    elif ext.lower() in ['.tif', '.tiff']:
        from libtiff import TIFF
        try:
            tiff = TIFF.open(filename, 'w')
            tiff.write_image(np.array(data, dtype=np.float32))
        except IOError:
            raise IOError("Error writing tif file at: \"%s\"" % filename)
        tiff.close()
    else:
        assert False, "Can only write out 3D hdf5, tiff, and numpy files"
from mpi4py import MPI
from h5py import File
import numpy as np
import time

rank = MPI.COMM_WORLD.Get_rank()
numProcs = MPI.COMM_WORLD.Get_size()
procsList = np.arange(numProcs)


def status(message, ranks=procsList):
    if rank in ranks:
        print("%s, process %d/%d: %s" % (time.strftime("%I:%M:%s%p"),
                                         rank + 1, numProcs, message))


def report(message):
    status(message, [0])


def reportbarrier(message):
    MPI.COMM_WORLD.Barrier()
    report(message)


fout = File(fname, "w", driver="mpio", comm=MPI.COMM_WORLD)
rows = fout.create_dataset("rows", (numRows, numCols), dtype=np.float64)
from h5py import File
import numpy as np

# about 3 GB
m = int(2e6)
n = 200
k = 100
r = 20

W = np.random.random((m, r))
H = np.zeros((r, n))
H[:, :r] = np.eye(r)
H[:, r:] = np.random.random((r, n - r))

for i in np.arange(2, 20, 1):
    # copy so the swap does not alias the column being overwritten
    temp = H[:, i].copy()
    H[:, i] = H[:, 10 * i]
    H[:, 10 * i] = temp

fout = File("testdata.h5", "w")
fout.create_dataset("mat", data=W.dot(H))
fout.close()
def create_data_set(
    data_file: h5py.File,
    data: pd.DataFrame,
    sample_rate: int = None,
    date_time: datetime = datetime.now(),
    site_id: str = '000',
    lane_id: str = '00',
    temperature: float = None,
    license_plate: str = None,
    sensor_calibration_factory: list = None,
    distance_between_sensors: list = None,
    sensor_type: str = None,
    sensors_layout: str = None,
    channel_configuration: str = None,
    **kwargs
) -> h5py.Dataset:
    """
    :param data_file:
    :param data:
    :param sample_rate: (e.g. 2000)
    :param date_time: (e.g. 2017-04-04 00:49:36)
    :param site_id: (e.g. 001)
    :param lane_id: (e.g. 01)
    :param temperature: (e.g. 28.5)
    :param license_plate: (e.g. AAA9999)
    :param sensor_calibration_factory: (e.g. [0.98, 0.99, 0.75])
    :param distance_between_sensors: (e.g. [1.0, 1.5, 2.0])
    :param sensor_type: (e.g. quartz, polymer, ceramic, mixed)
    :param sensors_layout: (e.g. |/|\|<|>|=|)
    :param channel_configuration: (this is an optional attribute, required only
        when the sensor type is mixed, e.g. "{'a0': 'polymer', 'a1': 'ceramic'}")
    :param kwargs:
    :return:
    """
    dset_id = 'run_{}_{}_{}'.format(
        site_id, lane_id, date_time.strftime('%Y%m%d_%H%M%S')
    )
    dset = data_file.create_dataset(
        dset_id,
        shape=(data.shape[0],),
        dtype=np.dtype([(k, float) for k in ['index'] + list(data.keys())])
    )

    dset['index'] = data.index
    for k in data.keys():
        dset[k] = data[k]

    dset.attrs['sample_rate'] = sample_rate
    dset.attrs['date_time'] = date_time.strftime('%Y-%m-%d %H:%M:%S')
    dset.attrs['site_id'] = site_id
    dset.attrs['lane_id'] = lane_id
    dset.attrs['temperature'] = temperature
    dset.attrs['license_plate'] = license_plate
    dset.attrs['sensor_calibration_factory'] = sensor_calibration_factory
    dset.attrs['distance_between_sensors'] = distance_between_sensors
    dset.attrs['sensor_type'] = sensor_type
    dset.attrs['sensors_layout'] = sensors_layout
    dset.attrs['channel_configuration'] = channel_configuration

    if kwargs:
        for k, v in kwargs.items():
            dset.attrs[k] = v

    return dset