def store_all_data(combined_data, cancer_type_list, output_filename):
    combined_data = combined_data.astype(str)
    with h5py.File(output_filename, "w") as out_f:
        dset = out_f.create_dataset("data", combined_data.shape,
                                    dtype=h5py.string_dtype('utf-8'))
        dset[:, :] = combined_data.values
        rowset = out_f.create_dataset("index", combined_data.index.shape,
                                      dtype=h5py.string_dtype('utf-8'))
        rowset[:] = combined_data.index.values
        colset = out_f.create_dataset("columns", combined_data.columns.shape,
                                      dtype=h5py.string_dtype('utf-8'))
        colset[:] = combined_data.columns.values
        ctype_set = out_f.create_dataset("cancer_types", (len(cancer_type_list), ),
                                         dtype=h5py.string_dtype('utf-8'))
        ctype_set[:] = cancer_type_list
    return
def init_log(self, maxlog: int) -> None:
    """Init logging interface to hdf5 file.

    @param maxlog: (initial) maximum number of log lines to reserve in the hdf5 file;
        if more space is needed, maxlog more lines will be reserved.
        Unused lines will be removed at run end.
    @type maxlog: int
    """
    self.maxlog = maxlog
    self.dlog = maxlog
    size = MPI_STATUS.size
    self.logging = self.h5file.create_group("Logging")
    self.logcount = self.logging.create_dataset("count", (size, ), fillvalue=0,
                                                dtype="int32")
    self.logs = self.logging.create_dataset(
        "logs",
        (size, maxlog),
        maxshape=(size, None),
        dtype=[
            ("level", "int32"),
            ("time", string_dtype(length=18)),
            ("runtime", "float32"),
            ("message", string_dtype(length=self.maxstrlen)),
        ],
    )
    MPI_GATE.register_function("addlog", self.add_log_line)
    self._init_log = True
def write_hdf(dataframe, ctype_ls, barcode_ls, filename):
    with h5py.File(filename, "w") as f:
        dset = f.create_dataset("data", dataframe.shape, dtype=float)
        dset[:, :] = dataframe.values
        columns = f.create_dataset("columns", dataframe.columns.shape,
                                   dtype=h5py.string_dtype('utf-8'))
        columns[:] = dataframe.columns.values
        idx = f.create_dataset("index", dataframe.index.shape,
                               dtype=h5py.string_dtype('utf-8'))
        idx[:] = dataframe.index.values
        ctypes = f.create_dataset("cancer_types", (len(ctype_ls), ),
                                  dtype=h5py.string_dtype('utf-8'))
        ctypes[:] = ctype_ls
        barcodes = f.create_dataset("barcodes", (len(barcode_ls), ),
                                    dtype=h5py.string_dtype('utf-8'))
        barcodes[:] = barcode_ls
    return
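# Hedged usage sketch (not part of the original code): reading a file written by
# write_hdf back into a pandas DataFrame. Dataset.asstr() assumes h5py >= 3.0,
# where string datasets are returned as bytes unless explicitly decoded.
def read_hdf_sketch(filename):
    import pandas as pd
    with h5py.File(filename, "r") as f:
        data = f["data"][:, :]
        columns = f["columns"].asstr()[:]  # decode variable-length UTF-8 to str
        index = f["index"].asstr()[:]
    return pd.DataFrame(data, index=index, columns=columns)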
def writeHDF5(objCollectionExt, extMagData, hdfOutputFileName):
    f = h5py.File(hdfOutputFileName, 'a')
    grpTables = f.create_group('Tables')

    # create object index table
    nObjects = objCollectionExt.nObj
    nFilters = objCollectionExt.nFilters
    objTable = np.empty((nObjects), dtype=[('objName', h5py.string_dtype()),
                                           ('indexLo', 'i4'),
                                           ('indexHi', 'i4')])
    for (i, objName) in enumerate(objCollectionExt.objNames):
        objTable[i]['objName'] = objName
        objTable[i]['indexLo'] = objCollectionExt.objSynSlice[objName].start
        objTable[i]['indexHi'] = objCollectionExt.objSynSlice[objName].stop
    print(objTable)
    grpTables.create_dataset('objectTable', data=objTable)

    # create variable index table
    varTable = np.empty((4 + nFilters), dtype=[('varName', h5py.string_dtype()),
                                               ('index', 'i4')])
    varTable[0] = ('Teff', objCollectionExt.offsetTeff)
    varTable[1] = ('logg', objCollectionExt.offsetLogg)
    varTable[2] = ('Av', objCollectionExt.offsetAv)
    varTable[3] = ('DM', objCollectionExt.offsetDM)
    for (i, filterName) in enumerate(objCollectionExt.filterNames):
        varTable[i + 4] = (filterName, i + 4)
    print(varTable)
    grpTables.create_dataset('varTable', data=varTable)

    grpData = f.create_group('Data')
    grpData.create_dataset('ChainData', data=extMagData)
    f.close()
def _save_run_results_hdf(outfile, results):
    # results: model_id timestamp class_labels (bin + roi_numbers)
    #          input_images output_classes output_scores
    with h5.File(outfile, 'w') as f:
        meta = f.create_dataset('metadata', data=h5.Empty('f'))
        meta.attrs['version'] = results['version']
        meta.attrs['model_id'] = results['model_id']
        meta.attrs['timestamp'] = results['timestamp']
        f.create_dataset('output_classes', data=results['output_classes'],
                         compression='gzip', dtype='float16')
        f.create_dataset('output_scores', data=results['output_scores'],
                         compression='gzip', dtype='float16')
        f.create_dataset('class_labels', data=np.string_(results['class_labels']),
                         compression='gzip', dtype=h5.string_dtype())
        if results['bin_id']:
            meta.attrs['bin_id'] = results['bin_id']
            f.create_dataset('roi_numbers', data=results['roi_numbers'],
                             compression='gzip', dtype='uint16')
        else:
            f.create_dataset('input_images', data=np.string_(results['input_images']),
                             compression='gzip', dtype=h5.string_dtype())
def create_plate(h5_file_path: Path, n_images: int):
    """
    Allocate space for the hdf5 arrays on disk for a given plate.
    """
    with h5py.File(h5_file_path, "w") as h5_file:
        h5_file.attrs["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        h5_file.attrs["info"] = h5py.version.info
        h5_file.create_dataset("images", (n_images, ) + constants.IMG_SHAPE,
                               np.float16)
        h5_file.create_dataset("site", (n_images, ), np.uint8)
        h5_file.create_dataset("well", (n_images, ),
                               h5py.string_dtype(encoding="utf-8"))
        h5_file.create_dataset("replicate", (n_images, ), np.uint8)
        h5_file.create_dataset("plate", (n_images, ),
                               h5py.string_dtype(encoding="utf-8"))
        h5_file.create_dataset("compound", (n_images, ),
                               h5py.string_dtype(encoding="utf-8"))
        h5_file.create_dataset("concentration", (n_images, ), np.float16)
        h5_file.create_dataset("moa", (n_images, ),
                               h5py.string_dtype(encoding="utf-8"))
def dump_data(arr, ctypes, patients, idx, out_hdf):
    print("Saving results: ", out_hdf)
    with h5py.File(out_hdf, "w") as f_out:
        # Store the feature set
        rows = f_out.create_dataset("index", shape=idx.shape,
                                    dtype=h5py.string_dtype('utf-8'))
        rows[:] = idx
        leading = 0
        lagging = 0
        # For each cancer type:
        for ct, pat in zip(ctypes, patients):
            leading += len(pat)
            # Store the data values
            dset = f_out.create_dataset(ct + "/data",
                                        shape=(arr.shape[0], len(pat)))
            dset[:] = arr[:, lagging:leading]
            # Store the columns
            columns = f_out.create_dataset(ct + "/columns", shape=(len(pat), ),
                                           dtype=h5py.string_dtype('utf-8'))
            columns[:] = pat
            lagging = leading
    return
def dataToHdf(self, file):
    # write the cell data
    file.create_dataset('Data', data=self.table_data)
    # write the column header data
    headerData = np.array(self.columnHeaderData,
                          dtype=h5py.string_dtype(encoding='utf-8'))
    file.create_dataset('HeaderData', data=headerData,
                        dtype=h5py.string_dtype(encoding='utf-8'))
def create_parameters(
    self,
    max: int = 10,
    method: str = "complete",
    extra: dict = {},
    force_create=False,
) -> str:
    filename = self.get_dataset_filename("data", "hdf5")
    if os.path.isfile(filename) and not force_create:
        print("Parameter file exists, not recreating "
              "(use --regenerate_samples if you want to force)")
        return filename

    print("+" * 40)
    print(f"Generating Dataset {self.name}, {max} examples")
    print(f"Datasets: {self.dataset_dir}")
    print("+" * 40)

    # Save out the parameters first
    self.save_parameters()

    # Generate the set of samples (could switch to generators,
    # but need to figure out arbitrary size arrays in HDF5)
    dataset: List[Sample] = []
    if method == "complete":
        dataset = self.parameters.recursively_generate_all()
    else:
        dataset = self.parameters.sample_space(sample_size=max)

    # Create the data file and add all the points to it
    with h5py.File(filename, "w") as datafile:
        # Figure out the sizes to store
        records = len(dataset)
        param_size = len(dataset[0].encode())

        # Add columns to it
        filenames = datafile.create_dataset("files", (records, ),
                                            dtype=h5py.string_dtype())
        parameters = datafile.create_dataset("parameters", (records, ),
                                             dtype=h5py.string_dtype())
        labels = datafile.create_dataset("labels", (records, param_size))
        audio_exists = datafile.create_dataset(
            "audio_exists", (records, ),
            dtype=np.bool_)  # np.bool_ instead of the removed np.bool alias

        # Generate the sample points
        for index, point in enumerate(dataset):
            params = self.parameters.to_settings(point)
            filenames[index] = self.get_wave_filename(index)
            labels[index] = point.encode()
            parameters[index] = json.dumps(params)
            audio_exists[index] = False
            if index % 1000 == 0:
                print("Generating parameters for example {}".format(index))
                datafile.flush()

        datafile.close()

    return filename
def write_array(file: IOBase, component: str, array: Array):
    # TODO : More validation on the inputs?
    group = get_write_group(file, component)
    for dataset in list(group):  # snapshot names so we can delete while iterating
        del group[dataset]
    group.create_dataset("array", data=array.data, track_times=False)
    if array.dimensions is not None:
        for i, dimension in enumerate(array.dimensions, start=1):
            if dimension.title is not None:
                group.create_dataset(
                    f"Dimension_{i}_title",
                    dtype=h5py.string_dtype(),
                    shape=(),
                    data=dimension.title,
                    track_times=False,
                )
            if dimension.names is not None:
                encoded_names = np.char.encode(dimension.names)
                group.create_dataset(
                    f"Dimension_{i}_names",
                    dtype=h5py.string_dtype(),
                    shape=encoded_names.shape,
                    data=encoded_names,
                    track_times=False,
                )
            if dimension.values is not None:
                values = np.array(dimension.values)
                group.create_dataset(
                    f"Dimension_{i}_values",
                    dtype=values.dtype,
                    shape=values.shape,
                    data=values,
                    track_times=False,
                )
            if dimension.units is not None:
                group.create_dataset(
                    f"Dimension_{i}_units",
                    dtype=h5py.string_dtype(),
                    shape=(),
                    data=dimension.units,
                    track_times=False,
                )
    if array.units is not None:
        group.create_dataset(
            "units",
            dtype=h5py.string_dtype(),
            shape=(),
            data=array.units,
            track_times=False,
        )
def main():
    with h5py.File(file_path, 'w') as f:
        dataset = f.create_dataset('/group/dataset', shape=(3, 4), dtype='i')
        dataset[:] = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
        dataset.attrs['double'] = math.pi

        hello = '早上好!'
        dataset.attrs['string-vlen'] = hello

        hello_utf8 = hello.encode('utf-8')
        hello_ascii = 'Hello, world!'
        dataset.attrs.create('string-ascii', hello_ascii, None,
                             '<S{0}'.format(len(hello_ascii)))

        utf8_type = h5py.string_dtype('utf-8', len(hello_utf8))
        # HDFView can not display the value of this attribute correctly, ViTables can.
        dataset.attrs.create('string', hello_utf8, None, utf8_type)

        dataset.attrs['boolean'] = True

        color_dt = h5py.enum_dtype({"RED": 0, "GREEN": 1, "BLUE": 42}, basetype='i')
        dataset.attrs.create('color', 42, dtype=color_dt)
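# Hedged companion sketch: how the attributes written in main() typically read
# back (exact return types vary slightly between h5py versions; this reflects
# h5py >= 3.0).
def inspect_attrs_sketch():
    with h5py.File(file_path, "r") as f:
        dset = f["/group/dataset"]
        print(dset.attrs["double"])        # float
        print(dset.attrs["string-vlen"])   # variable-length UTF-8 -> str
        print(dset.attrs["string-ascii"])  # fixed-length ASCII -> bytes
        print(dset.attrs["string"])        # fixed-length UTF-8 -> bytes, decode manually
        print(dset.attrs["boolean"])       # stored as an HDF5 enum, reads back as numpy bool
        print(dset.attrs["color"])         # enum member reads back as a plain integer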
def write(self, file_name):
    """
    Write the info to a file. If the universe has been processed,
    that information is also written.

    :param file_name: name of the file to write to
    :returns:
    :rtype:
    """
    dt = h5py.string_dtype(encoding="utf-8")

    with h5py.File(file_name, "w") as f:
        f.attrs["n_grbs"] = self._n_grbs
        f.attrs["is_processed"] = self._is_processed
        f.attrs["population_file"] = self._population_file

        grbs = f.create_dataset(
            "grb_saves", data=np.array(self._grb_save_files, dtype=dt)
        )

        if self._is_processed:
            grb_dets = f.create_dataset(
                "grb_dets", data=np.array(self._grb_detector_files, dtype=dt)
            )
def flush(self):
    if self._writable:
        # only write `_NCProperties` in newly created files
        if not self._preexisting_file and not self.invalid_netcdf:
            _NC_PROPERTIES = "version=2,h5netcdf=%s,hdf5=%s,%s=%s" % (
                __version__,
                self._h5py.version.hdf5_version,
                self._h5py.__name__,
                self._h5py.__version__,
            )
            self.attrs._h5attrs["_NCProperties"] = np.array(
                _NC_PROPERTIES,
                dtype=h5py.string_dtype(
                    encoding="ascii", length=len(_NC_PROPERTIES)
                ),
            )
        if self.invalid_netcdf:
            # see https://github.com/h5netcdf/h5netcdf/issues/165
            # warn user if .nc file extension is used for invalid netcdf features
            if os.path.splitext(self.filename)[1] == ".nc":
                msg = (
                    f"You are writing invalid netcdf features to file "
                    f"`{self.filename}`. The file will thus be not conforming "
                    f"to NetCDF-4 standard and might not be readable by other "
                    f"netcdf tools. Consider using a different extension."
                )
                warnings.warn(msg, UserWarning, stacklevel=2)
            # remove _NCProperties if invalid_netcdf and it exists
            if "_NCProperties" in self.attrs._h5attrs:
                del self.attrs._h5attrs["_NCProperties"]
def save(self, save_filename=None):
    # *************************
    # *** Save data to HDF5 ***
    # *************************
    if save_filename is None:
        script_path = os.path.realpath(__file__)  # full path of current script
        current_dir, script_basename = os.path.split(script_path)
        script_filename = os.path.splitext(script_basename)[0]  # name of current script
        timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())  # current date and time
        save_basename = f"{script_filename:s}_{timestamp:s}.h5"  # name of save file
        save_path = os.path.join(current_dir, "data", save_basename)  # full path of save file
    else:
        save_path = os.path.realpath(save_filename)

    # save also the sourcecode of the script for future reference
    source_code = get_sourcecode(__file__)

    with h5py.File(save_path, "w") as h5f:
        dt = h5py.string_dtype(encoding='utf-8')
        ds = h5f.create_dataset("source_code", (len(source_code), ), dt)
        for ii, line in enumerate(source_code):
            ds[ii] = line

        for attribute in self.__dict__:
            print(f"{attribute}: {self.__dict__[attribute]}")
            if attribute.startswith("_"):
                # don't save private attributes
                continue
            if attribute == "jpa_params":
                h5f.attrs[attribute] = str(self.__dict__[attribute])
            elif np.isscalar(self.__dict__[attribute]):
                h5f.attrs[attribute] = self.__dict__[attribute]
            else:
                h5f.create_dataset(attribute, data=self.__dict__[attribute])

    print(f"Data saved to: {save_path}")
    return save_path
def _create_dataset(file_handle, ds_name, data):
    data = _convert_list(data)
    try:
        if issubclass(data.dtype.type, bytes):
            data = np.void(data)  # byte strings aren't handled properly
        dataset = file_handle.create_dataset(ds_name,
                                             maxshape=(None,) + data.shape[1:],
                                             data=data)
    except TypeError:
        if issubclass(data.dtype.type, str):
            dtype = h5py.string_dtype(encoding='utf-8')
        else:
            raise TypeError
        dataset = file_handle.create_dataset(ds_name,
                                             shape=data.shape,
                                             maxshape=(None, ) + data.shape[1:],
                                             dtype=dtype)
        dataset[:] = data
    return dataset
def _save_validation_results_hdf(self, outfile, results):
    attrib_data = ['model_id', 'timestamp']
    attrib_data += 'f1_weighted recall_weighted precision_weighted f1_macro recall_macro precision_macro'.split()
    int_data = ['input_classes', 'output_classes'] + 'counts_perclass val_counts_perclass train_counts_perclass'.split()
    int_data.extend(['classes_by_' + stat for stat in 'f1 recall precision count'.split()])
    string_data = ['class_labels', 'image_fullpaths', 'image_basenames']

    with h5.File(outfile, 'w') as f:
        meta = f.create_dataset('metadata', data=h5.Empty('f'))
        for series in results:
            if series in attrib_data:
                meta.attrs[series] = results[series]
            elif series in string_data:
                f.create_dataset(series, data=np.string_(results[series]),
                                 compression='gzip', dtype=h5.string_dtype())
            elif series in int_data:
                f.create_dataset(series, data=results[series],
                                 compression='gzip', dtype='int16')
            elif isinstance(results[series], np.ndarray):
                f.create_dataset(series, data=results[series],
                                 compression='gzip', dtype='float16')
            else:
                raise UserWarning(
                    'hdf results: WE MISSED THIS ONE: {}'.format(series))
def save_collection(self, dest: Path) -> None:
    """Save the collection (queries and documents). Use the unique integer IDs
    for queries and documents. The original IDs can be recovered through a
    mapping that is also saved.

    Args:
        dest (Path): The file to create.
    """
    str_dt = h5py.string_dtype(encoding="utf-8")
    with h5py.File(dest, "w") as fp:
        ds = {
            "queries": fp.create_dataset("queries", (len(self.queries), ),
                                         dtype=str_dt),
            "orig_q_ids": fp.create_dataset("orig_q_ids",
                                            (len(self.orig_q_ids), ),
                                            dtype=str_dt),
            "docs": fp.create_dataset("docs", (len(self.docs), ), dtype=str_dt),
            "orig_doc_ids": fp.create_dataset("orig_doc_ids",
                                              (len(self.orig_doc_ids), ),
                                              dtype=str_dt),
        }
        for q_id, query in tqdm(self.queries.items(), desc="Saving queries"):
            ds["queries"][q_id] = query
            ds["orig_q_ids"][q_id] = self.orig_q_ids[q_id]
        for doc_id, doc in tqdm(self.docs.items(), desc="Saving documents"):
            ds["docs"][doc_id] = doc
            ds["orig_doc_ids"][doc_id] = self.orig_doc_ids[doc_id]
def to_hdf(self, hdf, group_name="structures"):
    # truncate arrays to necessary size before writing
    self._resize_atoms(self.num_atoms)
    self._resize_structures(self.num_structures)
    with hdf.open(group_name) as hdf_s_lst:
        self._type_to_hdf(hdf_s_lst)
        hdf_s_lst["num_atoms"] = self._num_atoms_alloc
        hdf_s_lst["num_structures"] = self._num_structures_alloc

        hdf_arrays = hdf_s_lst.open("arrays")
        for k, a in chain(self._per_atom_arrays.items(),
                          self._per_structure_arrays.items()):
            if a.dtype.char == "U":
                # numpy stores unicode data in UTF-32/UCS-4, but h5py wants UTF-8,
                # so we manually encode them here
                # TODO: string arrays with shape != () not handled
                # each character in a utf8 string might be encoded in up to 4 bytes,
                # so to make sure we can store any string of length n we tell h5py
                # that the string will be 4 * n bytes; numpy's dtype does this
                # calculation already in itemsize, so we don't need to repeat it here
                # see also https://docs.h5py.org/en/stable/strings.html
                hdf_arrays[k] = np.array(
                    [s.encode("utf8") for s in a],
                    dtype=h5py.string_dtype("utf-8", a.dtype.itemsize),
                )
            else:
                hdf_arrays[k] = a
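# Hedged round-trip sketch for the encoding scheme above (standalone, with
# illustrative file and dataset names): numpy "U" arrays live in memory as
# UTF-32, are re-encoded as fixed-length UTF-8 on write, and come back from
# h5py as bytes that need decoding.
def _string_array_roundtrip_sketch():
    symbols = np.array(["Fe", "O", "H"])  # dtype '<U2', itemsize 8 bytes (UTF-32)
    with h5py.File("structures_demo.h5", "w") as f:
        f["symbols"] = np.array(
            [s.encode("utf8") for s in symbols],
            dtype=h5py.string_dtype("utf-8", symbols.dtype.itemsize),
        )
    with h5py.File("structures_demo.h5", "r") as f:
        restored = np.array([s.decode("utf8") for s in f["symbols"][:]])
    assert (restored == symbols).all()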
def tokenize(self, input_hdf5_group: str, input_hdf5_dataset: str,
             output_hdf5_group: str, output_hdf5_dataset_tokenized: str,
             output_hdf5_dataset_tokenized_id: str) -> None:
    with h5py.File(self.hdf5_path, "a") as hdf5_store:
        hdf5_group = hdf5_store.get(input_hdf5_group)
        captions = numpy.array(hdf5_group[input_hdf5_dataset])

        captions_tokenized = []
        captions_tokenized_id = []
        for caption in tqdm(captions):
            caption_tokenized = (
                self.tokenizer.encode_with_bos_eos(caption))
            caption_tokenized_id = (
                self.tokenizer.encode_ids_with_bos_eos(caption))
            captions_tokenized.append(caption_tokenized)
            captions_tokenized_id.append(caption_tokenized_id)

        if output_hdf5_dataset_tokenized in hdf5_group.keys():
            del hdf5_group[output_hdf5_dataset_tokenized]
        if output_hdf5_dataset_tokenized_id in hdf5_group.keys():
            del hdf5_group[output_hdf5_dataset_tokenized_id]

        hdf5_group.create_dataset(
            output_hdf5_dataset_tokenized,
            data=numpy.array(captions_tokenized,
                             dtype=h5py.string_dtype(encoding="utf-8")))
        token_id_dataset = hdf5_group.create_dataset(
            output_hdf5_dataset_tokenized_id,
            shape=(len(captions_tokenized_id), ),
            dtype=h5py.vlen_dtype(numpy.dtype("int32")))
        token_id_dataset[...] = captions_tokenized_id
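# Hedged reading sketch for the datasets written by tokenize(): rows of the
# vlen int32 dataset come back as individual 1-D numpy arrays (ragged), and the
# tokenized strings come back as bytes unless decoded via asstr() (h5py >= 3.0).
# The group and dataset names below are placeholders.
def _read_tokenized_sketch(hdf5_path):
    with h5py.File(hdf5_path, "r") as f:
        grp = f["captions"]                           # hypothetical group name
        tokens = grp["caption_tokenized"].asstr()[:]  # numpy array of str
        token_ids = grp["caption_tokenized_id"][:]    # ragged: one int32 array per caption
    return tokens, token_ids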
def test_fixed_ascii(self):
    dt = h5py.string_dtype(encoding='ascii', length=10)
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'ascii'
    assert string_info.length == 10
    assert h5py.check_vlen_dtype(dt) is None
def normalize_whitespace(self, input_hdf5_group: str, input_hdf5_dataset: str,
                         output_hdf5_group: str, output_hdf5_dataset: str) -> None:
    self.logger.info("Normalizing whitespace. Data transfer:\n" +
                     f"\"{input_hdf5_group}/{input_hdf5_dataset}\" -> " +
                     f"\"{output_hdf5_group}/{output_hdf5_dataset}\".")
    self.regex_substitution(self.whitespace_regex, self.whitespace_placeholder,
                            input_hdf5_group, input_hdf5_dataset,
                            output_hdf5_group, output_hdf5_dataset)
    with h5py.File(self.hdf5_path, "a") as hdf5_store:
        captions = numpy.array(
            hdf5_store.get(input_hdf5_group).get(input_hdf5_dataset))
        captions_cleaned = []
        for caption in captions:
            caption_cleaned = caption.strip()
            captions_cleaned.append(caption_cleaned)
        output_group = hdf5_store.require_group(output_hdf5_group)
        if output_hdf5_dataset in output_group.keys():
            del output_group[output_hdf5_dataset]
        output_group.create_dataset(
            output_hdf5_dataset,
            data=numpy.array(captions_cleaned,
                             dtype=h5py.string_dtype(encoding="utf-8")))
def test_vlen_ascii(self):
    dt = h5py.string_dtype(encoding='ascii')
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'ascii'
    assert string_info.length is None
    assert h5py.check_vlen_dtype(dt) is bytes
def test_fixed_utf8(self):
    dt = h5py.string_dtype(length=10)
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'utf-8'
    assert string_info.length == 10
    assert h5py.check_vlen_dtype(dt) is None
def create_files_and_datasets(params, samples_folder):
    """
    Function to create the hdf5 files (trn, val and tst).
    :param params: (dict) Parameters found in the yaml config file.
    :param samples_folder: (str) Path to the output folder.
    :return: (hdf5 datasets) trn, val and tst datasets.
    """
    samples_size = params['global']['samples_size']
    number_of_bands = params['global']['number_of_bands']
    meta_map = get_key_def('meta_map', params['global'], {})
    real_num_bands = number_of_bands - MetaSegmentationDataset.get_meta_layer_count(meta_map)
    assert real_num_bands > 0, "invalid number of bands when accounting for meta layers"
    hdf5_files = []
    for subset in ["trn", "val", "tst"]:
        hdf5_file = h5py.File(
            os.path.join(samples_folder, f"{subset}_samples.hdf5"), "w")
        hdf5_file.create_dataset(
            "sat_img", (0, samples_size, samples_size, real_num_bands),
            np.float32,
            maxshape=(None, samples_size, samples_size, real_num_bands))
        hdf5_file.create_dataset("map_img", (0, samples_size, samples_size),
                                 np.int16,
                                 maxshape=(None, samples_size, samples_size))
        hdf5_file.create_dataset("meta_idx", (0, 1), dtype=np.int16,
                                 maxshape=(None, 1))
        hdf5_file.create_dataset("metadata", (0, 1), dtype=h5py.string_dtype(),
                                 maxshape=(None, 1))
        hdf5_files.append(hdf5_file)
    return hdf5_files
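# Hedged sketch of appending one sample to the resizable datasets created by
# create_files_and_datasets; `img_patch`, `label_patch` and `meta_dict` are
# placeholders for data produced elsewhere in the pipeline.
def _append_sample_sketch(hdf5_file, img_patch, label_patch, meta_dict):
    import json
    n = hdf5_file["sat_img"].shape[0]
    hdf5_file["sat_img"].resize(n + 1, axis=0)
    hdf5_file["sat_img"][n] = img_patch
    hdf5_file["map_img"].resize(n + 1, axis=0)
    hdf5_file["map_img"][n] = label_patch
    hdf5_file["meta_idx"].resize(n + 1, axis=0)
    hdf5_file["meta_idx"][n] = -1                     # placeholder meta index
    hdf5_file["metadata"].resize(n + 1, axis=0)
    hdf5_file["metadata"][n] = json.dumps(meta_dict)  # stored via the vlen string dtype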
def test_vlen_utf8(self):
    dt = h5py.string_dtype()
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'utf-8'
    assert string_info.length is None
    assert h5py.check_vlen_dtype(dt) is str
def preproc(self):
    ''' preprocess data
    '''
    if self.source != 'loseit':
        raise NotImplementedError

    import glob
    import pandas as pd

    fweeks = glob.glob(os.path.join(dat_dir, 'WeeklySummary*.csv'))
    for i, fweek in enumerate(fweeks):
        week = pd.read_csv(fweek)
        if i == 0:
            cols = week.columns
            data = [np.array(week[col]) for col in cols]
        else:
            assert np.array_equal(np.array(week.columns), np.array(cols))
            # use a separate loop variable to avoid shadowing the file index
            for icol in range(len(week.columns)):
                data[icol] = np.concatenate([data[icol], week[cols[icol]]])

    # save to hdf5
    h5 = h5py.File(os.path.join(dat_dir, '%s.hdf5' % self.source), 'w')
    # no meta data for now
    for i, col in enumerate(cols):
        if isinstance(data[i][0], str):
            h5.create_dataset(col, data=data[i], dtype=h5py.string_dtype())
        else:
            h5.create_dataset(col, data=data[i])
    h5.close()
    return None
def create_h5(df, hdf5name):
    # Exception occurs when files already exist.
    with h5py.File(hdf5name, 'w-') as f:
        dt_string = h5py.string_dtype()
        dset_fullname_creator = f.create_dataset('fullname_creator',
                                                 (len(df.index),), dtype=dt_string)
        dset_material = f.create_dataset('material', (len(df.index),),
                                         dtype=dt_string)
        dset_type = f.create_dataset('type', (len(df.index),), dtype=dt_string)
        dset_fullname_creator_cat = f.create_dataset('fullname_creator_cat',
                                                     (len(df.index),),
                                                     dtype=dt_string)
        dset_material_cat = f.create_dataset('material_cat', (len(df.index),),
                                             dtype=dt_string)
        dset_type_cat = f.create_dataset('type_cat', (len(df.index),),
                                         dtype=dt_string)

        dt_uint8 = h5py.special_dtype(vlen=np.dtype('uint8'))
        dset_img = f.create_dataset('images', (len(df.index),), dtype=dt_uint8)

        for i, r in df.iterrows():
            filename = str(r['filename'])
            print(f'[{i}]: {filename}')
            with open(images_path + filename, 'rb') as fin:
                dset_img[i] = np.frombuffer(fin.read(), dtype='uint8')
            dset_fullname_creator[i] = r['fullname_creator']
            dset_fullname_creator_cat[i] = r['fullname_creator_cat']
            dset_material[i] = r['material']
            dset_material_cat[i] = r['material_cat']
            dset_type[i] = r['type']
            dset_type_cat[i] = r['type_cat']
    print('Done')
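# Hedged reading sketch for a file produced by create_h5: the vlen uint8 rows
# come back as numpy arrays holding the raw file bytes. Decoding them with
# Pillow assumes the stored bytes are an image file; "collection.h5" and the
# function name are placeholders.
def _read_first_record_sketch():
    import io
    from PIL import Image
    with h5py.File("collection.h5", "r") as f:
        raw = bytes(f["images"][0])                  # vlen uint8 row -> bytes object
        image = Image.open(io.BytesIO(raw))
        creator = f["fullname_creator"].asstr()[0]   # h5py >= 3.0 string decoding
    return image, creator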
def run(self) -> None:
    # TODO: check whether cache exists
    self.logger.info(f"Tokenizing caption data.")
    with h5py.File(self.hdf5_path, "a") as hdf5_store:
        for hdf5_group_name in self.raw_data_group_names.values():
            hdf5_group = hdf5_store.get(hdf5_group_name)
            captions = numpy.array(hdf5_group["caption_cleaned"])

            captions_tokenized = []
            captions_tokenized_id = []
            for caption in captions:
                caption_tokenized = (
                    self.tokenizer.encode_with_bos_eos(caption))
                caption_tokenized_id = (
                    self.tokenizer.encode_ids_with_bos_eos(caption))
                captions_tokenized.append(caption_tokenized)
                captions_tokenized_id.append(caption_tokenized_id)

            if "caption_cleaned_tokenized" in hdf5_group.keys():
                del hdf5_group["caption_cleaned_tokenized"]
            if "caption_cleaned_tokenized_id" in hdf5_group.keys():
                del hdf5_group["caption_cleaned_tokenized_id"]

            hdf5_group.create_dataset(
                "caption_cleaned_tokenized",
                data=numpy.array(captions_tokenized,
                                 dtype=h5py.string_dtype(encoding="utf-8")))
            token_id_dataset = hdf5_group.create_dataset(
                "caption_cleaned_tokenized_id",
                shape=(len(captions_tokenized_id), ),
                dtype=h5py.vlen_dtype(numpy.dtype("int32")))
            token_id_dataset[...] = captions_tokenized_id
def get_result(self, discard=True):
    '''Get the result associated with this future, blocking until it is
    available. If ``discard`` is true, then removes the reference to the result
    contained in this instance, so that a collection of futures need not turn
    into a cache of all associated results.'''
    with self._condition:
        if self._done:
            if self._exception:
                # the traceback is stored as a plain string, not an h5py dtype
                if isinstance(self._traceback, str):
                    if self._traceback:
                        log.error('uncaught exception in remote function\n{}'
                                  .format(self._traceback))
                    raise self._exception
                else:
                    raise self._exception.with_traceback(self._traceback)
        else:
            self._condition.wait()
            assert self._done
            if self._exception:
                if isinstance(self._traceback, str):
                    log.error('uncaught exception in remote function\n{}'
                              .format(self._traceback))
                    raise self._exception
                else:
                    raise self._exception.with_traceback(self._traceback)
        result = self._result
        if discard:
            del self._result
        return result
def test_bytestr(self):
    """ Indexing a byte string dataset returns a real python byte string """
    dset = self.f.create_dataset('x', (1, ),
                                 dtype=h5py.string_dtype(encoding='ascii'))
    dset[0] = b"Hello there!"
    self.assertEqual(type(dset[0]), bytes)
def test_vlen_utf8(self):
    dt = h5py.string_dtype()
    string_info = h5py.check_string_dtype(dt)
    assert string_info.encoding == 'utf-8'
    assert string_info.length is None
    assert h5py.check_vlen_dtype(dt) is six.text_type
def test_compound(self):
    fields = []
    fields.append(('field_1', h5py.string_dtype()))
    fields.append(('field_2', np.int32))
    dt = np.dtype(fields)
    self.f['mytype'] = np.dtype(dt)
    dt_out = self.f['mytype'].dtype.fields['field_1'][0]
    string_inf = h5py.check_string_dtype(dt_out)
    self.assertEqual(string_inf.encoding, 'utf-8')
def test_vlen_string_array(self):
    """ Storage of vlen byte string arrays"""
    dt = h5py.string_dtype(encoding='ascii')
    data = np.ndarray((2,), dtype=dt)
    data[...] = b"Hello", b"Hi there! This is HDF5!"
    self.f.attrs['x'] = data
    out = self.f.attrs['x']
    self.assertEqual(out.dtype, dt)
    self.assertEqual(out[0], data[0])
    self.assertEqual(out[1], data[1])