def pump_group_paths(src_file: H5File) -> Generator[str, None, None]:
    """Yields paths to each existing pump/nopump group under each round

    Args:
        src_file (H5File): The file containing the experimental data

    Note:
        An easier way to generate these paths would be to collect a list of
        paths to all items in a file using the `visit` method, then keep only
        those paths that end with "pump". However, for large files it can take
        several seconds just to iterate through all of those items, so for the
        sake of speed we generate each possible path (not including the faulty
        ones) and just check to make sure it corresponds to a real group
        before yielding the path.

    Note:
        The candidate wavelengths are taken from the first round (in sorted
        order) only. If the 'rounds' group is empty nothing is yielded.

    Yields:
        Paths to pump/nopump subgroups of each wavelength under each round
    """
    rounds_root = src_file['rounds']
    rounds = sorted(subgroups(rounds_root))
    if not rounds:
        # No first round to read the wavelength names from
        return
    wavelengths = sorted(subgroups(rounds_root[rounds[0]]))
    for rnd in rounds:
        for wav in wavelengths:
            for pump in ('pump', 'nopump'):
                path = f'/rounds/{rnd}/{wav}/{pump}'
                try:
                    # The lookup is only an existence check
                    src_file[path]
                except KeyError:
                    continue
                yield path
def map_rounds_datasets(old_file: H5File, new_file: H5File,
                        func: Callable[[Group, Group], None]) -> None:
    """Applies a mapping function to every wavelength group of every round

    For each `roundX/WWWWW` group found in the old file, the group at the
    same relative path in the new file is looked up and both are handed to
    `func`. Once every group has been visited, `copy_all_attributes` is
    called on the two files.

    Note:
        The mapping function must have the following signature:
        func(old_group: Group, new_group: Group) -> None

    Note:
        Not covered by tests yet

    Args:
        old_file (H5File): The file containing the original data
        new_file (H5File): The file to store the mapped data into
        func (Callable): The function that will be used to map data from the
            old file to the new file
    """
    source_rounds = old_file['rounds']
    target_rounds = new_file['rounds']
    for rnd_name in subgroups(source_rounds):
        source_round = source_rounds[rnd_name]
        for wav_name in subgroups(source_round):
            relative_path = f'{rnd_name}/{wav_name}'
            func(source_rounds[relative_path], target_rounds[relative_path])
    copy_all_attributes(old_file, new_file)
def make_top_level_wavelengths_group(organized_file: H5File) -> None:
    """Builds a top level 'wavelengths' group that mirrors the 'rounds' data

    The data is re-indexed by wavelength first and round second, giving the
    following structure:

        File
            wavelengths
                WWWWW
                    roundX
                        pump
                        nopump

    Note that these new groups and datasets are simply hard links, not new
    copies of the data, so no space penalty is incurred by adding a new
    organizational structure like this.

    Args:
        organized_file (H5File): A file that has already had its data
            reorganized into rounds
    """
    logger.info(f'Assembling a top level \'wavelengths\' group in {organized_file.filename}')
    organized_file.create_group('wavelengths')
    by_wavelength = organized_file['wavelengths']
    for wav_name in wavelength_set(organized_file):
        by_wavelength.create_group(wav_name)
    by_round = organized_file['rounds']
    for rnd_name in subgroups(by_round):
        for wav_name in subgroups(by_round[rnd_name]):
            by_wavelength[wav_name].create_group(rnd_name)
            source_group = by_round[f'{rnd_name}/{wav_name}']
            for pump_name in subgroups(source_group):
                link_target = by_round[f'{rnd_name}/{wav_name}/{pump_name}']
                # Assigning an existing object links it under the new path
                by_wavelength[f'{wav_name}/{rnd_name}/{pump_name}'] = link_target
    logger.info(f'Done creating \'wavelengths\' group')
def test_copy_rounds_structure_for_delta_a(organized_clean_data: H5File,
                                           starts_empty: H5File):
    """Checks that every round/wavelength group of the organized file is
    present in the initially empty file after the structure has been copied
    """
    reorg.copy_rounds_structure_for_delta_a(organized_clean_data, starts_empty)
    source_rounds = organized_clean_data['rounds']
    expected_paths = [
        f'rounds/{rnd}/{wav}'
        for rnd in subgroups(source_rounds)
        for wav in subgroups(source_rounds[rnd])
    ]
    for path in expected_paths:
        try:
            starts_empty[path]
        except KeyError:
            fail(f'Path not copied into new file: {path}')
def wavelength_set(organized_file: H5File) -> Set[str]:
    """Collects the names of every wavelength group found under any round

    Works on both reorganized raw-data files and dA files, since only the
    `rounds/roundX/WWWWW` levels are inspected.

    Args:
        organized_file (H5File): A file whose data has already been organized
            into rounds

    Returns:
        The set of wavelengths present in the file
    """
    rounds_root = organized_file['rounds']
    return {
        wav_name
        for rnd_name in subgroups(rounds_root)
        for wav_name in subgroups(rounds_root[rnd_name])
    }
def copy_rounds_structure_for_delta_a(old_file: H5File, new_file: H5File) -> None:
    """Recreates the File/rounds/roundX/WWWWW group layout in another file

    Only groups are created; no datasets are copied. The new file is meant
    to receive dA data at a later time.

    Args:
        old_file (H5File): The file whose structure will be copied
        new_file (H5File): An empty file that will have a group structure
            copied into it
    """
    source_rounds = old_file['rounds']
    for rnd_name in subgroups(source_rounds):
        wavelength_names = subgroups(source_rounds[rnd_name])
        for wav_name in wavelength_names:
            new_file.require_group(f'rounds/{rnd_name}/{wav_name}')
def combine_files(filenames: List[str], joined_name: str) -> None:
    """Combines multiple small files into one large file with the rounds
    renamed

    The rounds of each input file are visited in sorted order and copied into
    the output file under consecutive names `round000`, `round001`, ... The
    numbering continues across input files in the order they are listed.

    Args:
        filenames (List[str]): The list of filenames to combine
        joined_name (str): The name of the file to store the merged data in
            i.e. 'joined.h5'

    Note:
        Not covered by tests yet

    Warning:
        This function recombines files that have already been restructured.
        Do not use this on raw data files.
    """
    logger.info(f'Combining {filenames} into {joined_name}')
    # Context managers make sure both files are closed even if a copy fails
    with h5py.File(joined_name, 'w', libver='latest') as new_file:
        new_rounds_root = new_file.create_group('rounds')
        rnd_counter = 0
        for filename in filenames:
            logger.debug(f'Copying {filename} into {joined_name}')
            with h5py.File(filename, 'r', libver='latest') as old_file:
                old_rounds_root = old_file['rounds']
                for old_rnd_name in sorted(subgroups(old_rounds_root)):
                    new_rnd_name = f'round{rnd_counter:0>3d}'
                    new_round = new_rounds_root.create_group(new_rnd_name)
                    recursive_copy(old_rounds_root[old_rnd_name], new_round)
                    rnd_counter += 1
def test_creating_wavelengths_top_level_group(organized_clean_data: H5File):
    """Checks that the 'wavelengths' group ends up with exactly one subgroup
    per wavelength present in the test file
    """
    reorg.make_top_level_wavelengths_group(organized_clean_data)
    created = set(subgroups(organized_clean_data['wavelengths']))
    assert created == {'76487', '76715'}
def export_txt_data(data_group: Group, sample_name: str, folder_name: str,
                    include_cd: bool = False) -> None:
    """Exports the data of every wavelength group below `data_group` as CSV

    A folder with the specified name is created in the working directory, and
    under that folder a folder is created for each signal ('perp', 'par', and
    optionally 'cd'). The folder hierarchy is thus:

        <folder_name>
        |---perp
        |---par
        |---cd

    Each file holds two columns, the 'time' dataset and the signal dataset,
    and is placed in the folder of its signal. The filenames are formatted as
    follows:

        <sample_name>-<wavelength>-<signal>.txt

    Note that although the data is saved in CSV format, the file extension is
    `.txt`. This is for compatibility with Spectra Solve, which expects
    comma-delimited data to have a `.txt` extension (for some reason).

    If a directory with the requested name already exists an error is logged
    and nothing is exported.

    Args:
        data_group (Group): The group containing the data to export
        sample_name (str): A sample name to include in the filename of each
            piece of data
        folder_name (str): The name of the folder in which to store the
            exported data
        include_cd (bool): Indicates whether to export CD data
    """
    data_dir = Path.cwd() / folder_name
    if data_dir.exists() and data_dir.is_dir():
        logger.error(
            f'Directory {data_dir} already exists, please choose another name')
        return
    data_dir.mkdir()

    signals = ['perp', 'par', 'cd'] if include_cd else ['perp', 'par']
    signal_dirs = {sig: data_dir / sig for sig in signals}
    for sig_dir in signal_dirs.values():
        sig_dir.mkdir()

    for wav in subgroups(data_group):
        wav_group = data_group[wav]
        time_data = wav_group['time'][...]
        for sig in signals:
            signal_dset = wav_group[sig]
            two_column_data = np.column_stack((time_data, signal_dset[...]))
            file_path = signal_dirs[sig] / f'{sample_name}-{wav}-{sig}.txt'
            logger.info(
                f'Saving dataset {signal_dset.name} to file {file_path}')
            with file_path.open(mode='wb') as out_file:
                np.savetxt(out_file, two_column_data, delimiter=',')
def test_subgroups(clean_raw_data):
    """Checks that `subgroups` reports the top level group names of the clean
    test file

    Args:
        clean_raw_data (H5File): An HDF5 file containing clean test data
    """
    expected_names = ['spectrum1', 'spectrum2', 'spectrum3']
    assert subgroups(clean_raw_data) == expected_names
def recursive_copy(old_parent: Group, new_parent: Group) -> None:
    """Copies the group tree below the old parent into the new parent

    Subgroups are recreated under the same names and descended into.
    Datasets are copied, as 32-bit floats, only from groups that have no
    subgroups of their own.

    Note:
        NOTE(review): datasets that sit next to subgroups in the same group
        are not copied. Confirm that the files this is used on never have
        that layout.

    Args:
        old_parent (Group): The group whose contents will be copied
        new_parent (Group): The group that will be copied into
    """
    child_groups = subgroups(old_parent)
    if child_groups:
        for group_name in child_groups:
            new_parent.create_group(group_name)
            recursive_copy(old_parent[group_name], new_parent[group_name])
        return
    for dset_name in datasets(old_parent):
        new_parent.create_dataset(dset_name,
                                  data=old_parent[dset_name][...],
                                  dtype=np.float32)
def create_wavelength_groups_for_all_rounds(old_file: H5File, new_file: H5File) -> None:
    """Creates the wavelength subgroups of every round in the new file

    The top level groups of the old file (except 'experiment_parameters') are
    run through `make_spectrum_map`; the sorted keys of that map are paired
    with the sorted round names, and each pair is passed on to
    `create_wavelength_groups_for_one_round`.

    Args:
        old_file (H5File): The original file with the experimental data
        new_file (H5File): The new file in which the data will be organized
    """
    rounds_root = new_file['rounds']
    round_names = sorted(subgroups(rounds_root))
    spectrum_names = [name for name in sorted(subgroups(old_file))
                      if name != 'experiment_parameters']
    spectrum_map = make_spectrum_map(spectrum_names)
    for map_key, rnd_name in zip(sorted(spectrum_map.keys()), round_names):
        source_group = old_file[spectrum_map[map_key]]
        create_wavelength_groups_for_one_round(source_group, rounds_root[rnd_name])
def copy_datasets(old_file: H5File, new_file: H5File) -> None:
    """Copies the datasets of each spectrum group into the matching round

    Spectrum groups are paired with rounds the same way as when the rounds
    were created: sorted keys of the spectrum map against sorted round names.
    Within a spectrum the dataset names are grouped per wavelength by
    `pair_dataset_names`.

    * When a wavelength has two datasets, their destination paths come from
      `round_path_for_old_dataset`.
    * When only one dataset is present it is stored under
      `/rounds/<round>/<wavelength>/faulty1`.

    The actual transfer is delegated to `split_and_store_old_dataset`.

    Args:
        old_file (H5File): The original file with the experimental data
        new_file (H5File): The new file in which the data will be organized
    """
    rounds_root = new_file['rounds']
    round_names = sorted(subgroups(rounds_root))
    spectrum_names = [name for name in subgroups(old_file)
                      if name != 'experiment_parameters']
    spectrum_map = make_spectrum_map(spectrum_names)
    for map_key, rnd in zip(sorted(spectrum_map.keys()), round_names):
        spectrum_name = spectrum_map[map_key]
        name_groups = sorted(pair_dataset_names(datasets(old_file[spectrum_name])))
        for wav, *dset_names in name_groups:
            if len(dset_names) >= 2:
                first_old_path = f'/{spectrum_name}/{dset_names[0]}'
                second_old_path = f'/{spectrum_name}/{dset_names[1]}'
                first_dset = old_file[first_old_path]
                second_dset = old_file[second_old_path]
                first_new_path, second_new_path = round_path_for_old_dataset(
                    first_dset, second_dset, rnd, wav)
                # Both destination groups exist before any data is stored
                new_file.create_group(first_new_path)
                new_file.create_group(second_new_path)
                transfers = (
                    (first_old_path, first_new_path, first_dset),
                    (second_old_path, second_new_path, second_dset),
                )
            else:
                # A lone dataset has no partner, so it is filed as faulty
                lone_old_path = f'/{spectrum_name}/{dset_names[0]}'
                lone_dset = old_file[lone_old_path]
                faulty_path = f'/rounds/{rnd}/{wav}/faulty1'
                new_file.create_group(faulty_path)
                transfers = ((lone_old_path, faulty_path, lone_dset),)
            for old_path, new_path, dset in transfers:
                target_group = new_file[new_path]
                logger.debug(f'Copying dataset {old_path} to {new_path}')
                split_and_store_old_dataset(target_group, dset)
def test_create_wavelength_groups_under_all_rounds(clean_raw_data, has_rounds_groups):
    """Checks that each of the three rounds receives exactly the two expected
    wavelength groups
    """
    reorg.create_wavelength_groups_for_all_rounds(clean_raw_data, has_rounds_groups)
    for rnd_name in ('round001', 'round002', 'round003'):
        created = subgroups(has_rounds_groups[f'rounds/{rnd_name}'])
        assert len(created) == 2
        for expected_wav in ('76487', '76715'):
            assert expected_wav in created
def test_create_wavelength_groups_under_one_round(clean_raw_data, starts_empty):
    """Checks that the wavelengths found in one spectrum group of the
    original file are created as groups in the new file
    """
    source_group: Group = clean_raw_data['spectrum1']
    reorg.create_wavelength_groups_for_one_round(source_group, starts_empty)
    # Despite its name, `starts_empty` now holds the freshly created groups
    created = subgroups(starts_empty)
    assert len(created) == 2
    assert '76487' in created
    assert '76715' in created
def test_correct_rounds_created_in_starts_empty(clean_raw_data, starts_empty):
    """Checks that each `spectrumN` group produces a zero-padded `roundNNN`
    group, e.g. `spectrum1` becomes `round001`.

    Args:
        clean_raw_data (H5File): An HDF5 file containing clean test data
        starts_empty (H5File): An empty file for testing the creation of
            rounds
    """
    reorg.create_rounds_from_spectra(clean_raw_data, starts_empty)
    created_rounds = subgroups(starts_empty['rounds'])
    assert 'round001' in created_rounds
    assert 'round002' in created_rounds
    assert 'round003' in created_rounds
def wavelengths_under_rounds_paths(
        src_file: H5File) -> Generator[str, None, None]:
    """Yields paths to each existing wavelength group under each round

    Args:
        src_file (H5File): The file containing the experimental data

    Note:
        The candidate wavelengths are taken from the first round (in sorted
        order) only, and each candidate path is checked for existence before
        it is yielded. If the 'rounds' group is empty nothing is yielded.

    Yields:
        Paths to wavelength subgroups of each round
    """
    rounds_root = src_file['rounds']
    rounds = sorted(subgroups(rounds_root))
    if not rounds:
        # No first round to read the wavelength names from
        return
    wavelengths = sorted(subgroups(rounds_root[rounds[0]]))
    for rnd in rounds:
        for wav in wavelengths:
            path = f'/rounds/{rnd}/{wav}'
            try:
                # The lookup is only an existence check
                src_file[path]
            except KeyError:
                continue
            yield path
def test_restructure_and_merge_renumbers_correctly(filenames_for_merging):
    """Verifies that the rounds are renumbered correctly during the merge.

    The merged file is expected to contain six rounds named `round000`
    through `round005`. The temporary output file is closed and removed even
    when reading it fails, so a broken run does not leave `joined.h5` behind.
    """
    restructure_and_merge(filenames_for_merging, joined_name='joined.h5')
    try:
        with h5py.File('joined.h5', 'r', libver='latest') as merged:
            renamed_rounds = subgroups(merged['rounds'])
    finally:
        remove('joined.h5')
    assert len(renamed_rounds) == 6
    for num in range(6):
        expected_round = f'round{num:0>3d}'
        assert expected_round in renamed_rounds
def print_missing(filenames: List[str]) -> None:
    """Walks each file looking for wavelength groups that don't have any
    pump or faulty subgroups, then prints a summary

    The number of empty wavelength groups is printed first, followed by one
    line per group in the form `<filename minus last 3 chars>: <group path>`.

    Args:
        filenames (List[str]): The list of filenames to inspect
    """
    missing = []
    for filename in filenames:
        # The context manager closes the file even if the walk raises
        with h5py.File(filename, 'r') as data:
            rounds_root = data['rounds']
            for rnd_name in subgroups(rounds_root):
                for wav_name in subgroups(rounds_root[rnd_name]):
                    wav_group = rounds_root[f'{rnd_name}/{wav_name}']
                    if not subgroups(wav_group):
                        # Presumably strips a '.h5' extension from the name
                        missing.append(
                            f'{data.filename[:-3]}: {wav_group.name}')
    print(f'Number of missing files: {len(missing)}')
    for path in missing:
        print(path)
def all_signal_dataset_paths(src_file: H5File) -> Generator[str, None, None]:
    """Yields paths to each existing perp/par/ref dataset in a pump/nopump
    group

    Args:
        src_file (H5File): The file with the experiment data

    Note:
        The candidate wavelengths are taken from the first round (in sorted
        order) only, and each candidate path is checked for existence before
        it is yielded. If the 'rounds' group is empty nothing is yielded.

    Yields:
        Paths to each dataset in the file that isn't a time dataset
    """
    rounds_root = src_file['rounds']
    rounds = sorted(subgroups(rounds_root))
    if not rounds:
        # No first round to read the wavelength names from
        return
    wavelengths = sorted(subgroups(rounds_root[rounds[0]]))
    for rnd in rounds:
        for wav in wavelengths:
            for pump in ('pump', 'nopump'):
                for channel in ('perp', 'par', 'ref'):
                    path = f'/rounds/{rnd}/{wav}/{pump}/{channel}'
                    try:
                        # The lookup is only an existence check
                        src_file[path]
                    except KeyError:
                        continue
                    yield path
def make_delta_a_wavelength_groups(src_file: H5File) -> None:
    """Builds a top level 'wavelengths' group for a dA file, dividing the
    data by wavelength rather than by round

    The structure of the new wavelength groups will be as follows:

        File
            wavelengths
                WWWWW
                    roundX
                        time
                        perp
                        par

    Note that the new entries are simply hard links to the datasets under
    'rounds', not new copies of the data, so no space penalty is incurred by
    adding a new organizational structure like this.

    Note:
        Not covered by tests yet

    Args:
        src_file (H5File): A file that has already had its data reorganized
            into rounds
    """
    src_file.create_group('wavelengths')
    wav_root = src_file['wavelengths']
    for wav_name in wavelength_set(src_file):
        wav_root.create_group(wav_name)
    rounds_root = src_file['rounds']
    for rnd_name in subgroups(rounds_root):
        for wav_name in subgroups(rounds_root[rnd_name]):
            wav_root[wav_name].create_group(rnd_name)
            for dset_name in ('time', 'perp', 'par'):
                source = rounds_root[f'{rnd_name}/{wav_name}/{dset_name}']
                wav_root[f'{wav_name}/{rnd_name}/{dset_name}'] = source
def paths_for_signal(src_file: H5File,
                     chan: DeltaAChannel) -> Generator[str, None, None]:
    """Yields the paths to each dataset corresponding to the specified signal

    The dataset name is taken from `chan.name` and looked up directly below
    each `roundX/WWWWW` group.

    Args:
        src_file (H5File): The file containing the experiment data
        chan (DeltaAChannel): The channel whose paths will be produced

    Note:
        The candidate wavelengths are taken from the first round (in sorted
        order) only, and each candidate path is checked for existence before
        it is yielded. If the 'rounds' group is empty nothing is yielded.

    Yields:
        Paths to each of the datasets of the specified signal
    """
    rounds_root = src_file['rounds']
    rounds = sorted(subgroups(rounds_root))
    if not rounds:
        # No first round to read the wavelength names from
        return
    wavelengths = sorted(subgroups(rounds_root[rounds[0]]))
    for rnd in rounds:
        for wav in wavelengths:
            path = f'/rounds/{rnd}/{wav}/{chan.name}'
            try:
                # The lookup is only an existence check
                src_file[path]
            except KeyError:
                continue
            yield path
def create_rounds_from_spectra(old_file: H5File, new_file: H5File) -> None:
    """Creates a 'rounds' group holding one 'roundX' group per 'spectrumX'
    group at the top level of the original file

    The number found in the spectrum name is left-padded with zeros to at
    least three characters, so `spectrum1` becomes `round001`. Top level
    groups whose names do not contain `spectrum<digits>` are skipped.

    Args:
        old_file (H5File): The original file with the experimental data
        new_file (H5File): The new, empty file in which the data will be
            organized
    """
    new_file.create_group('rounds')
    spectrum_pattern = re.compile(r'spectrum(\d+)')
    for group_name in subgroups(old_file):
        found = spectrum_pattern.search(group_name)
        if found is not None:
            round_name = f'round{found.group(1):0>3s}'
            new_file['rounds'].create_group(round_name)
def channel_heatmap(file: H5File,
                    func: Callable[[Array, Array], np.float64],
                    channel: InputChannel,
                    pump: PumpStatus) -> Array:
    """Generates a (number of rounds)x(number of wavelengths) array by
    visiting each wavelength group in the file. The value at each position in
    the array is computed by the function provided.

    Rows follow the order in which `subgroups` reports the rounds; columns
    follow the sorted wavelengths of the whole file. A pixel whose
    round/wavelength/pump group does not exist is left at 0.

    Note:
        The "func" argument must have the following signature:
        func(time: Array, signal: Array) -> np.float64

    Args:
        file (H5File): The file to generate the heatmap data from
        func (Callable): A function that will be used to compute the heatmap
            values
        channel (InputChannel): The input channel to use
        pump (PumpStatus): Which dataset (pump or nopump) the data should
            come from

    Returns:
        A (number of rounds)x(number of wavelengths) array that contains the
        heatmap pixels
    """
    rounds_root = file['rounds']
    rounds = subgroups(rounds_root)
    wavelengths = sorted(wavelength_set(file))
    # Starting from zeros gives missing groups their pixel value for free
    pixels = np.zeros((len(rounds), len(wavelengths)), dtype=np.float64)
    for i, rnd in enumerate(rounds):
        for j, wav in enumerate(wavelengths):
            try:
                group = rounds_root[f'{rnd}/{wav}/{pump.value}']
            except KeyError:
                continue
            time_dset = group['time']
            input_dset = group[f'{channel.value}']
            # Take the length from the dataset's shape rather than reading
            # the whole dataset just to count its points
            points = time_dset.shape[0]
            # cast float32 data to float64 for computations to avoid rounding
            # errors
            time_data = np.empty(points, dtype=np.float64)
            input_channel_data = np.empty(points, dtype=np.float64)
            time_dset.read_direct(time_data)
            input_dset.read_direct(input_channel_data)
            pixels[i, j] = func(time_data, input_channel_data)
    return pixels
def filter_faulty_groups(src_file: H5File) -> List[str]:
    """Returns the list of signal dataset paths contained in faulty groups

    A wavelength group counts as faulty when the name of its first subgroup
    contains 'faulty'. For such a group the datasets below all of its
    subgroups are collected, leaving out any path that contains 'time'.
    Wavelength groups without any subgroups are skipped.

    Args:
        src_file (H5File): The file containing the experimental data

    Returns:
        The paths to all of the signal datasets in faulty groups
    """
    faulty_paths = []
    for wav_path in rawnav.wavelengths_under_rounds_paths(src_file):
        pump_groups = subgroups(src_file[wav_path])
        # Guard against empty wavelength groups before looking at element 0
        if not pump_groups or 'faulty' not in pump_groups[0]:
            continue
        for p_name in pump_groups:
            pump_path = f'{wav_path}/{p_name}'
            faulty_paths += [
                p for p in dataset_paths_below(src_file, pump_path)
                if 'time' not in p
            ]
    return faulty_paths
def dataset_paths_below(src_file: H5File, group_path: str) -> List[str]:
    """Collects the paths of every dataset at any depth below a group

    Paths from nested subgroups come first, followed by the datasets that
    sit directly in the group itself. One use case is listing each dataset
    below a group that has been deemed to contain bad data.

    Args:
        src_file (H5File): The file containing the experimental data
        group_path (str): The path to the parent group

    Returns:
        The paths of all datasets below the parent group
    """
    parent_group = src_file[group_path]
    nested_paths = [
        path
        for g_name in subgroups(parent_group)
        for path in dataset_paths_below(src_file, f'{group_path}/{g_name}')
    ]
    direct_paths = [f'{group_path}/{d_name}'
                    for d_name in datasets(parent_group)]
    return nested_paths + direct_paths