import h5py
import numpy as np
from tqdm import tqdm
from typing import Generator, List, Tuple


def write_to_hdf5(cache_handler: h5py.File, embed_stream: Generator[Tuple[List[str], np.ndarray], None, None]):
    # Each key is a list of path components: all but the last form the group
    # path, the last names the dataset inside that group.
    for key, array_data in tqdm(embed_stream):
        group_key = '/'.join(key[:-1])
        dataset_key = key[-1]
        group_obj = cache_handler.require_group(group_key)
        group_obj.create_dataset(name=dataset_key, data=array_data, dtype=np.float32)
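# A minimal usage sketch for write_to_hdf5. The generator and file name below are
# illustrative assumptions, not part of the original code.

def example_embed_stream():
    # hypothetical stream of (key components, embedding array) pairs
    yield (["corpus", "doc1", "sentence0"], np.random.rand(3, 8).astype(np.float32))
    yield (["corpus", "doc1", "sentence1"], np.random.rand(3, 8).astype(np.float32))

with h5py.File("embeddings_cache.h5", "w") as cache:
    write_to_hdf5(cache, example_embed_stream())
    # Datasets end up at /corpus/doc1/sentence0 and /corpus/doc1/sentence1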
def dumpToHdf5(
    self,
    h5_file: h5py.File,
    inner_path: str,
    progress_signal: Callable[[int], None] = lambda x: None,
) -> str:
    progress_signal(0)
    try:
        h5_file.require_group(Path("/").joinpath(inner_path).parent.as_posix())
        graph = Graph()
        op_writer = OpH5N5WriterBigDataset(
            graph=graph,
            h5N5File=h5_file,
            h5N5Path=inner_path,
            CompressionEnabled=False,
            BatchSize=1,
            Image=self.get_provider_slot(graph=graph),
        )
        op_writer.progressSignal.subscribe(progress_signal)
        success = op_writer.WriteImage.value  # reading this slot triggers the write
    finally:
        progress_signal(100)
def write_optimization_options(f: h5py.File) -> None:
    """Create groups and write some default optimization settings."""

    # set common options
    g = f.require_group('optimizationOptions')
    g.attrs['optimizer'] = 0  # IpOpt
    g.attrs['retryOptimization'] = 1
    g.attrs['hierarchicalOptimization'] = 1
    g.attrs['numStarts'] = 1

    # set IpOpt options
    g = f.require_group('optimizationOptions/ipopt')
    g.attrs['max_iter'] = 100
    g.attrs['hessian_approximation'] = np.string_("limited-memory")
    g.attrs["limited_memory_update_type"] = np.string_("bfgs")
    g.attrs["tol"] = 1e-9
    g.attrs["acceptable_iter"] = 1
    # set ridiculously high, so only the acceptable_* options below matter
    g.attrs["acceptable_tol"] = 1e20
    g.attrs["acceptable_obj_change_tol"] = 1e-12
    g.attrs["watchdog_shortened_iter_trigger"] = 0

    # set fmincon options
    g = f.require_group('optimizationOptions/fmincon')
    g.attrs['MaxIter'] = 100
    g.attrs["TolX"] = 1e-8
    g.attrs["TolFun"] = 0
    g.attrs["MaxFunEvals"] = 1e7
    g.attrs["algorithm"] = np.string_("interior-point")
    g.attrs["GradObj"] = np.string_("on")
    g.attrs["display"] = np.string_("iter")

    # set CERES options
    g = f.require_group('optimizationOptions/ceres')
    g.attrs['max_num_iterations'] = 100

    # set toms611/SUMSL options
    g = f.require_group('optimizationOptions/toms611')
    g.attrs['mxfcal'] = 1e8
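# A small sketch of how write_optimization_options might be used; the file name is
# an illustrative assumption. The defaults can be inspected or overridden afterwards
# through the group attributes.
import h5py

with h5py.File("parameter_estimation.h5", "w") as f:
    write_optimization_options(f)
    # Override a default, e.g. run more multi-starts
    f["optimizationOptions"].attrs["numStarts"] = 10
    print(dict(f["optimizationOptions/ipopt"].attrs))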
def write_parameters(file: h5py.File, rows: int, epoch: int, parameters: Sequence[Mapping[str, ParameterSet]]):
    param_group = file.require_group(Group.PARAMETERS)
    for layer_parameters in parameters:
        for params in layer_parameters.values():
            group = param_group.require_group(params.name)
            get_dataset(group, "values", rows, np.shape(params.values))[epoch] = params.values
            get_dataset(group, "gradients", rows, np.shape(params.gradients))[epoch] = params.gradients
            delta_values = np.reshape(
                [delta.value for delta in params.deltas.flatten()], params.deltas.shape
            )
            get_dataset(group, "delta_values", rows, np.shape(delta_values))[epoch] = delta_values
def _save_hdf5(self, file: h5py.File):
    """
    Actual implementation of HDF5 saving.

    Args:
        file: The open h5py.File to write the skeleton data to.

    Returns:
        None
    """
    # All skeletons will be put as sub-groups in the skeleton group
    if "skeleton" not in file:
        all_sk_group = file.create_group("skeleton", track_order=True)
    else:
        all_sk_group = file.require_group("skeleton")

    # Serialize the skeleton to a JSON string, then store it in a string attribute
    all_sk_group.attrs[self.name] = np.string_(self.to_json())
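# Reading back what _save_hdf5 stored: each skeleton is a JSON string attribute on
# the "skeleton" group, keyed by the skeleton's name. The file name below is an
# illustrative assumption.
import json
import h5py

with h5py.File("labels.h5", "r") as f:
    for name, json_str in f["skeleton"].attrs.items():
        print(name, json.loads(json_str))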
class DataExplorer:
    """Navigate datafiles created through aggregation or processing.

    Valid datafiles are those created by DataClassifier.consolidate_data and
    DataProcessor.run_process. In those files the top level refers either to:

    - a kind of measurement (DataClassifier.consolidate_data)
    - a tier of analysis (DataProcessor.run_process)

    """

    #: Path to the file to open.
    path: str

    #: Should the file be opened so as to allow editing it.
    allow_edits: bool = False

    #: Should a brand new file be created.
    create_new: bool = False

    def open(self) -> None:
        """Open the underlying HDF5 file."""
        mode = "w" if self.create_new else ("r+" if self.allow_edits else "r")
        self._file = File(self.path, mode)

    def close(self) -> None:
        """Close the underlying HDF5 file."""
        if self._file:
            self._file.close()
        self._file = None

    def __enter__(self) -> "DataExplorer":
        """Open the underlying HDF5 file when used as a context manager."""
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the underlying HDF5 file when used as a context manager."""
        self.close()

    def list_top_level(self) -> List[str]:
        """List the top level groups (measurement or tier)."""
        if not self._file:
            raise RuntimeError("No opened datafile")
        return list(self._file.keys())

    def list_classifiers(self, measurement) -> Dict[int, List[str]]:
        """List the classifier names by level."""
        if not self._file:
            raise RuntimeError("No opened datafile")
        if measurement not in self._file:
            raise ValueError(
                f"No measurement {measurement} in opened datafile, "
                f"existing measurements are {self.list_top_level()}"
            )

        def extract_classifiers(
            group: Group, classifiers: Dict[int, List[str]], level: int
        ) -> Dict[int, List[str]]:
            # By construction the classifiers are the same on each level,
            # so we only visit one entry per level.
            for entry in group.values():
                if isinstance(entry, Group):
                    classifiers[level] = list(entry.attrs)
                    extract_classifiers(entry, classifiers, level + 1)
                    break
            return classifiers

        return extract_classifiers(self._file[measurement], dict(), 0)

    def walk_data(
        self, measurement: str
    ) -> Iterator[Tuple[Dict[int, Dict[str, Any]], Group]]:
        """Iterate over all the data found under one top level entry.

        This function provides the classifiers and the group containing the
        datasets of interest.
        """
        # Maximal depth of classifiers
        max_depth = len(self.list_classifiers(measurement))

        def yield_classifier_and_data(
            group: Group, depth: int, classifiers: Dict[int, Dict[str, Any]]
        ) -> Iterator[Tuple[Dict[int, Dict[str, Any]], Group]]:
            # If the group has any dataset, yield it and then keep going.
            # This is relevant for processed data merged from different measurements.
            if any(isinstance(k, Dataset) for k in group.values()):
                yield classifiers, group
            if depth == max_depth - 1:
                for g in [g for g in group.values() if isinstance(g, Group)]:
                    clfs = classifiers.copy()
                    clfs[depth] = dict(g.attrs)
                    yield clfs, g
            else:
                for g in group.values():
                    clfs = classifiers.copy()
                    clfs[depth] = dict(g.attrs)
                    yield from yield_classifier_and_data(g, depth + 1, clfs)

        yield from yield_classifier_and_data(self._file[measurement], 0, dict())

    def get_data(
        self, toplevel: str, classifiers: Dict[int, Dict[str, Any]]
    ) -> Group:
        """Retrieve the group containing the datasets matching the classifiers."""
        known = self.list_classifiers(toplevel)
        if not {k: list(v) for k, v in classifiers.items()} == known:
            raise ValueError(
                f"Unknown classifiers used ({classifiers}),"
                f" known classifiers are {known}"
            )

        group = self._file[toplevel]
        for level, values in classifiers.items():
            key = make_group_name(values)
            if key not in group:
                raise ValueError(
                    f"No entry of level {level} found for {values}, "
                    f"at this level known entries are "
                    f"{[dict(g.attrs) for g in group.values()]}."
                )
            group = group[key]

        return group

    def require_group(
        self, toplevel: str, classifiers: Dict[int, Dict[str, Any]]
    ) -> Group:
        """Access the group matching the toplevel and classifiers.

        If any group does not exist it is created.
        """
        # Ensure the top group is present
        group = self._file.require_group(toplevel)

        # At each classifier level check if the group exists; create it if necessary
        for level, values in classifiers.items():
            key = make_group_name(values)
            if key not in group:
                group = group.create_group(key)
                group.attrs.update(values)
            else:
                group = group[key]

        return group
class HDF5Recorder(BaseRecorder):
    """
    A recorder that stores data using HDF5. This format naturally handles
    hierarchical data and is a standard for handling large datasets.

    Args
    ----
    out : str
        String containing the filename for the HDF5 file.

    **driver_kwargs
        Additional keyword args to be passed to the HDF5 driver.

    Options
    -------
    options['record_metadata'] : bool(True)
        Tells recorder whether to record variable attribute metadata.
    options['record_unknowns'] : bool(True)
        Tells recorder whether to record the unknowns vector.
    options['record_params'] : bool(False)
        Tells recorder whether to record the params vector.
    options['record_resids'] : bool(False)
        Tells recorder whether to record the residuals vector.
    options['includes'] : list of strings
        Patterns for variables to include in recording.
    options['excludes'] : list of strings
        Patterns for variables to exclude in recording (processed after includes).
    """

    def __init__(self, out, **driver_kwargs):
        super(HDF5Recorder, self).__init__()
        self.out = File(out, "w", **driver_kwargs)

        metadata_group = self.out.require_group("metadata")
        metadata_group.create_dataset("format_version", data=format_version)

    def record_metadata(self, group):
        """Stores the metadata of the given group in an HDF5 file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors.
        """
        params = group.params.iteritems()
        resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()

        metadata_group = self.out["metadata"]

        # The group metadata could be anything, so it needs to be pickled.
        # There are other ways of storing arbitrary Python objects in HDF5,
        # but this is the simplest.
        system_metadata_val = np.array(pickle.dumps(group.metadata, pickle.HIGHEST_PROTOCOL))
        metadata_group.create_dataset("system_metadata", data=system_metadata_val)

        # Also store the model_viewer_data
        model_viewer_data = get_model_viewer_data(group)
        model_viewer_data_val = np.array(pickle.dumps(model_viewer_data, pickle.HIGHEST_PROTOCOL))
        metadata_group.create_dataset("model_viewer_data", data=model_viewer_data_val)

        pairings = (
            (metadata_group.create_group("Parameters"), params),
            (metadata_group.create_group("Unknowns"), unknowns),
        )

        for grp, data in pairings:
            for key, val in data:
                meta_group = grp.create_group(key)
                for mkey, mval in iteritems(val):
                    meta_group.create_dataset(mkey, data=mval)
                # if isinstance(val, (np.ndarray, Number)):
                #     grp.create_dataset(key, data=val)
                #     # TODO: Compression/Checksum?
                # else:
                #     # TODO: Handling non-numeric data
                #     msg = "HDF5 Recorder does not support data of type '{0}'".format(type(val))
                #     raise NotImplementedError(msg)

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the HDF5 file using the iteration
        coordinate for the Group name.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        iteration_coordinate = metadata["coord"]
        group_name = format_iteration_coordinate(iteration_coordinate)

        f = self.out
        group = f.require_group(group_name)
        group.attrs["timestamp"] = metadata["timestamp"]
        group.attrs["success"] = metadata["success"]
        group.attrs["msg"] = metadata["msg"]

        pairings = []

        if self.options["record_params"]:
            p_group = group.create_group("Parameters")
            pairings.append((p_group, self._filter_vector(params, "p", iteration_coordinate)))

        if self.options["record_unknowns"]:
            u_group = group.create_group("Unknowns")
            pairings.append((u_group, self._filter_vector(unknowns, "u", iteration_coordinate)))

        if self.options["record_resids"]:
            r_group = group.create_group("Residuals")
            pairings.append((r_group, self._filter_vector(resids, "r", iteration_coordinate)))

        for grp, data in pairings:
            for key, val in iteritems(data):
                if isinstance(val, (np.ndarray, Number)):
                    grp.create_dataset(key, data=val)
                    # TODO: Compression/Checksum?
                else:
                    # TODO: Handling non-numeric data
                    msg = "HDF5 Recorder does not support data of type '{0}'".format(type(val))
                    raise NotImplementedError(msg)

    def record_derivatives(self, derivs, metadata):
        """Writes the derivatives that were calculated for the driver.

        Args
        ----
        derivs : dict
            Dictionary containing derivatives.

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        iteration_coordinate = metadata["coord"]
        group_name = format_iteration_coordinate(iteration_coordinate)

        # Get the group for the iteration
        iteration_group = self.out[group_name]

        # Create a group under that called 'Derivs'
        deriv_group = iteration_group.require_group("Derivs")

        # Then add timestamp, success, msg as attributes
        deriv_group.attrs["timestamp"] = metadata["timestamp"]
        deriv_group.attrs["success"] = metadata["success"]
        deriv_group.attrs["msg"] = metadata["msg"]

        # And the actual deriv data. derivs could be either a dict or an ndarray,
        # depending on the optimizer.
        if isinstance(derivs, np.ndarray):
            deriv_group.create_dataset("Derivatives", data=derivs)
        elif isinstance(derivs, OrderedDict):
            deriv_data_group = deriv_group.require_group("Derivatives")
            for k, v in derivs.items():
                g = deriv_data_group.require_group(k)
                for k2, v2 in v.items():
                    g.create_dataset(k2, data=v2)
        else:
            raise ValueError("Currently can only record derivatives that are ndarrays or OrderedDicts")