Ejemplo n.º 1
0
def pump_group_paths(src_file: H5File) -> Generator[str, None, None]:
    """Yields paths to each existing pump/nopump group under each round

    Args:
        src_file (H5File): The file containing the experimental data

    Note:
        An easier way to generate these paths would be to collect a list of paths
        to all items in a file using the `visit` method, then keep only those paths
        that end with "pump". However, for large files it can take several seconds
        just to iterate through all of those items, so for the sake of speed we
        generate each possible path (not including the faulty ones) and just check
        to make sure it corresponds to a real group before yielding the path.

    Note:
        The wavelengths are listed separately for every round rather than being
        taken from the first round only. A file without any rounds therefore
        yields nothing instead of raising an IndexError, and wavelengths that
        only appear in later rounds are still visited.

    Yields:
        Paths to pump/nopump subgroups of each wavelength under each round
    """
    rounds_root = src_file['rounds']
    for rnd in sorted(subgroups(rounds_root)):
        for wav in sorted(subgroups(rounds_root[rnd])):
            for pump in ('pump', 'nopump'):
                path = f'/rounds/{rnd}/{wav}/{pump}'
                try:
                    src_file[path]
                except KeyError:
                    # No group of this kind under this wavelength, skip it
                    continue
                yield path
Ejemplo n.º 2
0
def map_rounds_datasets(old_file: H5File, new_file: H5File,
                        func: Callable[[Group, Group], None]) -> None:
    """Applies a mapping function to every matching pair of wavelength groups

    Each `rounds/<round>/<wavelength>` group of the old file is looked up together
    with the group at the same path in the new file, and both are handed to `func`.
    Once every pair has been visited, `copy_all_attributes` is called on the two files.

    Note:
        The mapping function must have the following signature:
            func(old_group: Group, new_group: Group) -> None

    Note:
        Not covered by tests yet

    Args:
        old_file (H5File): The file containing the original data
        new_file (H5File): The file to store the mapped data into
        func (Callable): The function that maps data from an old group into a new group
    """
    src_rounds = old_file['rounds']
    dst_rounds = new_file['rounds']
    for rnd in subgroups(src_rounds):
        for wav in subgroups(src_rounds[rnd]):
            # The same relative path addresses the group in both files
            relative_path = f'{rnd}/{wav}'
            func(src_rounds[relative_path], dst_rounds[relative_path])
    copy_all_attributes(old_file, new_file)
Ejemplo n.º 3
0
def make_top_level_wavelengths_group(organized_file: H5File) -> None:
    """Adds a top level 'wavelengths' group that indexes the data by wavelength
    first and by round second.

    The resulting layout is:
    File
        wavelengths
            WWWWW
                roundX
                    pump
                    nopump

    The entries under each `roundX` are created by assigning the existing groups
    from under 'rounds' to the new paths, i.e. they are hard links rather than
    copies, so the additional structure does not duplicate any data.

    Args:
        organized_file (H5File): A file that has already had its data reorganized into rounds
    """
    logger.info(f"Assembling a top level 'wavelengths' group in {organized_file.filename}")
    organized_file.create_group('wavelengths')
    by_wavelength = organized_file['wavelengths']
    for wavelength in wavelength_set(organized_file):
        by_wavelength.create_group(wavelength)
    by_round = organized_file['rounds']
    for rnd in subgroups(by_round):
        for wavelength in subgroups(by_round[rnd]):
            by_wavelength[wavelength].create_group(rnd)
            source_prefix = f'{rnd}/{wavelength}'
            target_prefix = f'{wavelength}/{rnd}'
            for pump_state in subgroups(by_round[source_prefix]):
                # Assigning an existing group to a new name creates a hard link
                linked = by_round[f'{source_prefix}/{pump_state}']
                by_wavelength[f'{target_prefix}/{pump_state}'] = linked
    logger.info("Done creating 'wavelengths' group")
Ejemplo n.º 4
0
def test_copy_rounds_structure_for_delta_a(organized_clean_data: H5File,
                                           starts_empty: H5File):
    """Checks that every round/wavelength group of the organized file exists in
    the initially empty file after the structure has been copied
    """
    reorg.copy_rounds_structure_for_delta_a(organized_clean_data, starts_empty)
    source_rounds = organized_clean_data['rounds']
    expected_paths = [
        f'rounds/{rnd}/{wav}'
        for rnd in subgroups(source_rounds)
        for wav in subgroups(source_rounds[rnd])
    ]
    for expected in expected_paths:
        try:
            starts_empty[expected]
        except KeyError:
            fail(f'Path not copied into new file: {expected}')
Ejemplo n.º 5
0
def wavelength_set(organized_file: H5File) -> Set[str]:
    """Collects the names of all wavelength groups found under any round of a file

    Args:
        organized_file (H5File): A file whose data has already been organized into rounds

    Returns:
        The union of the wavelength group names of every round in the file
    """
    rounds_root = organized_file['rounds']
    return {
        wav_name
        for rnd_name in subgroups(rounds_root)
        for wav_name in subgroups(rounds_root[rnd_name])
    }
Ejemplo n.º 6
0
def copy_rounds_structure_for_delta_a(old_file: H5File,
                                      new_file: H5File) -> None:
    """Recreates the `rounds/roundX/WWWWW` group hierarchy of one file in another

    Only groups are created, no datasets are copied. The new file is meant to
    receive dA data at a later time.

    Args:
        old_file (H5File): The file whose structure will be copied
        new_file (H5File): An empty file that will have a group structure copied into it
    """
    source_rounds = old_file['rounds']
    for rnd_name in subgroups(source_rounds):
        for wav_name in subgroups(source_rounds[rnd_name]):
            new_file.require_group(f'rounds/{rnd_name}/{wav_name}')
Ejemplo n.º 7
0
def combine_files(filenames: List[str], joined_name: str) -> None:
    """Combines multiple small files into one large file with the rounds renamed

    The rounds of each input file are visited in sorted order and copied into the
    joined file under consecutive names `round000`, `round001`, ... in the order
    the files are listed.

    Args:
        filenames (List[str]): The list of filenames to combine
        joined_name (str): The name of the file to store the merged data in i.e. 'joined.h5'

    Note:
        Not covered by tests yet

    Warning:
        This function recombines files that have already been restructured. Do not use this
        on raw data files.
    """
    logger.info(f'Combining {filenames} into {joined_name}')
    # Context managers guarantee that both files are closed even if a copy fails
    with h5py.File(joined_name, 'w', libver='latest') as new_file:
        new_file.create_group('rounds')
        new_rounds_root = new_file['rounds']
        rnd_counter = 0
        for filename in filenames:
            logger.debug(f'Copying {filename} into {joined_name}')
            with h5py.File(filename, 'r', libver='latest') as old_file:
                old_rounds_root = old_file['rounds']
                for old_rnd_name in sorted(subgroups(old_rounds_root)):
                    new_rnd_name = f'round{rnd_counter:0>3d}'
                    new_rounds_root.create_group(new_rnd_name)
                    recursive_copy(old_rounds_root[old_rnd_name],
                                   new_rounds_root[new_rnd_name])
                    rnd_counter += 1
Ejemplo n.º 8
0
def test_creating_wavelengths_top_level_group(organized_clean_data: H5File):
    """Checks that the 'wavelengths' group contains exactly one subgroup per
    wavelength of the test file
    """
    reorg.make_top_level_wavelengths_group(organized_clean_data)
    created = subgroups(organized_clean_data['wavelengths'])
    expected = {'76487', '76715'}
    assert set(created) == expected
Ejemplo n.º 9
0
def export_txt_data(data_group: Group,
                    sample_name: str,
                    folder_name: str,
                    include_cd: bool = False) -> None:  # noqa
    """Exports the data of every wavelength group below `data_group` as CSV files.

    A folder named `folder_name` is created in the working directory together
    with one subfolder per exported signal ('perp', 'par', and optionally 'cd'):
        <folder_name>
        |---perp
        |---par
        |---cd

    For each wavelength and signal a two-column file (time, signal) is written
    into the folder of that signal. The files are named:
        <sample_name>-<wavelength>-<signal>.txt

    The content is comma-delimited even though the extension is `.txt`. This is
    for compatibility with Spectra Solve, which expects comma-delimited data to
    carry a `.txt` extension.

    If the target folder already exists as a directory, an error is logged and
    nothing is exported.

    Args:
        data_group (Group): The group containing the data to export
        sample_name (str): A sample name to include in the filename of each piece of data
        folder_name (str): The name of the folder in which to store the exported data
        include_cd (bool): Indicates whether to export CD data
    """
    data_dir = Path.cwd() / folder_name
    # `is_dir` is already False for a path that does not exist
    if data_dir.is_dir():
        logger.error(
            f'Directory {data_dir} already exists, please choose another name')
        return
    data_dir.mkdir()
    signals = ['perp', 'par'] + (['cd'] if include_cd else [])
    signal_dirs = {sig: data_dir / sig for sig in signals}
    for sig in signals:
        signal_dirs[sig].mkdir()
    for wav in subgroups(data_group):
        wav_group = data_group[wav]
        time_data = wav_group['time'][...]
        for sig in signals:
            sig_data = wav_group[sig][...]
            two_column_data = np.column_stack((time_data, sig_data))
            file_path = signal_dirs[sig] / f'{sample_name}-{wav}-{sig}.txt'
            logger.info(
                f'Saving dataset {wav_group[sig].name} to file {file_path}')
            with file_path.open(mode='wb') as out_file:
                np.savetxt(out_file, two_column_data, delimiter=',')
Ejemplo n.º 10
0
def test_subgroups(clean_raw_data):
    """Checks that `subgroups` reports the names of the top level groups of the test file

    Args:
        clean_raw_data (H5File): An HDF5 file containing clean test data
    """
    expected_names = ['spectrum1', 'spectrum2', 'spectrum3']
    assert subgroups(clean_raw_data) == expected_names
Ejemplo n.º 11
0
def recursive_copy(old_parent: Group, new_parent: Group) -> None:
    """Copies the contents of the old parent group to the new parent group

    The datasets directly below `old_parent` are copied first, then every
    subgroup is recreated in `new_parent` and copied recursively. Datasets are
    copied at every level, so a group that holds both datasets and subgroups
    does not lose its datasets.

    Note:
        Every dataset is stored as `np.float32` in the new group. Only groups
        and datasets are copied by this function.

    Args:
        old_parent (Group): The group whose contents will be copied
        new_parent (Group): The group that will be copied into
    """
    for dset_name in datasets(old_parent):
        new_parent.create_dataset(dset_name,
                                  data=old_parent[dset_name][...],
                                  dtype=np.float32)
    for group_name in subgroups(old_parent):
        new_parent.create_group(group_name)
        recursive_copy(old_parent[group_name], new_parent[group_name])
Ejemplo n.º 12
0
def create_wavelength_groups_for_all_rounds(old_file: H5File, new_file: H5File) -> None:
    """Creates the wavelength subgroups of every round in the new file

    The top level groups of the old file (except 'experiment_parameters') are
    passed through `make_spectrum_map`; the sorted keys of that map are paired
    in order with the sorted round names of the new file, and the wavelength
    groups of each pair are created by `create_wavelength_groups_for_one_round`.

    Args:
        old_file (H5File): The original file with the experimental data
        new_file (H5File): The new file in which the data will be organized
    """
    rounds_root = new_file['rounds']
    round_names = sorted(subgroups(rounds_root))
    spectrum_names = [
        name for name in sorted(subgroups(old_file))
        if name != 'experiment_parameters'
    ]
    spectrum_map = make_spectrum_map(spectrum_names)
    for map_key, rnd in zip(sorted(spectrum_map.keys()), round_names):
        spectrum_group = old_file[spectrum_map[map_key]]
        create_wavelength_groups_for_one_round(spectrum_group, rounds_root[rnd])
Ejemplo n.º 13
0
def copy_datasets(old_file: H5File, new_file: H5File) -> None:
    """Copies the datasets of each spectrum in the old file into the matching
    round of the new file

    The dataset names of a spectrum are grouped by `pair_dataset_names` into
    entries of the form (wavelength, name, [name]). An entry with two dataset
    names is stored in the two groups chosen by `round_path_for_old_dataset`;
    an entry with fewer than two names is stored under
    `/rounds/<round>/<wavelength>/faulty1`. The actual transfer of the data is
    done by `split_and_store_old_dataset`.

    Args:
        old_file (H5File): The original file with the experimental data
        new_file (H5File): The new file in which the data will be organized
    """
    rounds_root = new_file['rounds']
    round_names = sorted(subgroups(rounds_root))
    spectrum_names = [name for name in subgroups(old_file)
                      if name != 'experiment_parameters']
    spectrum_map = make_spectrum_map(spectrum_names)
    for map_key, rnd in zip(sorted(spectrum_map.keys()), round_names):
        spectrum = spectrum_map[map_key]
        paired_names = sorted(pair_dataset_names(datasets(old_file[spectrum])))
        for wav, *members in paired_names:
            if len(members) >= 2:
                first_old_path = f'/{spectrum}/{members[0]}'
                second_old_path = f'/{spectrum}/{members[1]}'
                first_dset = old_file[first_old_path]
                second_dset = old_file[second_old_path]
                first_new_path, second_new_path = round_path_for_old_dataset(
                    first_dset, second_dset, rnd, wav)
                # Both destination groups are created before any data is stored
                new_file.create_group(first_new_path)
                new_file.create_group(second_new_path)
                transfers = (
                    (first_old_path, first_new_path, new_file[first_new_path], first_dset),
                    (second_old_path, second_new_path, new_file[second_new_path], second_dset),
                )
                for old_path, new_path, target_group, source_dset in transfers:
                    logger.debug(f'Copying dataset {old_path} to {new_path}')
                    split_and_store_old_dataset(target_group, source_dset)
            else:
                # A dataset without a partner goes into a 'faulty1' group
                lone_old_path = f'/{spectrum}/{members[0]}'
                lone_dset = old_file[lone_old_path]
                faulty_path = f'/rounds/{rnd}/{wav}/faulty1'
                new_file.create_group(faulty_path)
                logger.debug(f'Copying dataset {lone_old_path} to {faulty_path}')
                split_and_store_old_dataset(new_file[faulty_path], lone_dset)
Ejemplo n.º 14
0
def test_create_wavelength_groups_under_all_rounds(clean_raw_data, has_rounds_groups):
    """Checks that each of the three rounds receives exactly the two wavelength
    groups present in the test data
    """
    reorg.create_wavelength_groups_for_all_rounds(clean_raw_data, has_rounds_groups)
    for rnd_name in ('round001', 'round002', 'round003'):
        created = subgroups(has_rounds_groups[f'rounds/{rnd_name}'])
        assert len(created) == 2
        for expected_wavelength in ('76487', '76715'):
            assert expected_wavelength in created
Ejemplo n.º 15
0
def test_create_wavelength_groups_under_one_round(clean_raw_data, starts_empty):
    """Checks that the wavelengths of `spectrum1` in the original file are created
    as groups in the new file
    """
    reorg.create_wavelength_groups_for_one_round(clean_raw_data['spectrum1'], starts_empty)
    # `starts_empty` now holds the freshly created wavelength groups
    created = subgroups(starts_empty)
    assert len(created) == 2
    assert '76487' in created
    assert '76715' in created
Ejemplo n.º 16
0
def test_correct_rounds_created_in_starts_empty(clean_raw_data, starts_empty):
    """Checks that a `roundXXX` group is created in the empty file for each of the
    three spectra of the test data, e.g. `round001` for `spectrum1`.

    Args:
        clean_raw_data (H5File): An HDF5 file containing clean test data
        starts_empty (H5File): An empty file for testing the creation of rounds
    """
    reorg.create_rounds_from_spectra(clean_raw_data, starts_empty)
    created_rounds = subgroups(starts_empty['rounds'])
    expected_rounds = ('round001', 'round002', 'round003')
    for expected in expected_rounds:
        assert expected in created_rounds
Ejemplo n.º 17
0
def wavelengths_under_rounds_paths(
        src_file: H5File) -> Generator[str, None, None]:
    """Yields paths to each existing wavelength group under each round

    The wavelength groups are listed separately for every round rather than
    being taken from the first round only. A file without any rounds therefore
    yields nothing instead of raising an IndexError, and wavelengths that only
    appear in later rounds are still reported.

    Args:
        src_file (H5File): The file containing the experimental data

    Yields:
        Paths to wavelength subgroups of each round, ordered by round name and
        then by wavelength name
    """
    rounds_root = src_file['rounds']
    for rnd in sorted(subgroups(rounds_root)):
        # The names come from the round itself, so every path refers to an
        # existing group and no lookup is needed to verify it
        for wav in sorted(subgroups(rounds_root[rnd])):
            yield f'/rounds/{rnd}/{wav}'
Ejemplo n.º 18
0
def test_restructure_and_merge_renumbers_correctly(filenames_for_merging):
    """Verifies that the rounds are renumbered correctly during the merge.

    The merged file is closed and deleted even when reading it fails, so a
    failing run does not leave `joined.h5` behind for later tests.
    """
    restructure_and_merge(filenames_for_merging, joined_name='joined.h5')
    try:
        with h5py.File('joined.h5', 'r', libver='latest') as merged:
            renamed_rounds = subgroups(merged['rounds'])
    finally:
        remove('joined.h5')
    assert len(renamed_rounds) == 6
    for num in range(6):
        expected_round = f'round{num:0>3d}'
        assert expected_round in renamed_rounds
Ejemplo n.º 19
0
def print_missing(filenames: List[str]) -> None:
    """Prints the wavelength groups that don't contain any subgroups

    Each file is walked round by round; a wavelength group without any subgroup
    is recorded as `<filename without its last three characters>: <group path>`.
    The number of such groups is printed first, followed by one line per group.

    Args:
        filenames (List[str]): The list of filenames to inspect
    """
    missing = []
    for filename in filenames:
        # The context manager closes the file even if walking it raises
        with h5py.File(filename, 'r') as data:
            rounds_root = data['rounds']
            for rnd_name in subgroups(rounds_root):
                for wav_name in subgroups(rounds_root[rnd_name]):
                    wav_group = rounds_root[f'{rnd_name}/{wav_name}']
                    if not subgroups(wav_group):
                        missing.append(f'{data.filename[0:-3]}: {wav_group.name}')
    print(f'Number of missing files: {len(missing)}')
    for path in missing:
        print(path)
Ejemplo n.º 20
0
def all_signal_dataset_paths(src_file: H5File) -> Generator[str, None, None]:
    """Yields paths to each existing perp/par/ref dataset in a pump/nopump group

    The wavelength groups are listed separately for every round rather than
    being taken from the first round only. A file without any rounds therefore
    yields nothing instead of raising an IndexError, and wavelengths that only
    appear in later rounds are still visited.

    Args:
        src_file (H5File): The file with the experiment data

    Yields:
        Paths to each dataset in the file that isn't a time dataset
    """
    rounds_root = src_file['rounds']
    for rnd in sorted(subgroups(rounds_root)):
        for wav in sorted(subgroups(rounds_root[rnd])):
            for pump in ('pump', 'nopump'):
                for channel in ('perp', 'par', 'ref'):
                    path = f'/rounds/{rnd}/{wav}/{pump}/{channel}'
                    try:
                        src_file[path]
                    except KeyError:
                        # Candidate path does not exist in this file, skip it
                        continue
                    yield path
Ejemplo n.º 21
0
def make_delta_a_wavelength_groups(src_file: H5File) -> None:
    """Adds a top level 'wavelengths' group that indexes the data by wavelength
    first and by round second.

    The resulting layout is:
    File
        wavelengths
            WWWWW
                roundX
                    time
                    perp
                    par

    The `time`, `perp` and `par` entries are created by assigning the existing
    objects from under 'rounds' to the new paths, i.e. they are hard links rather
    than copies, so the additional structure does not duplicate any data.

    Note:
        Not covered by tests yet

    Args:
        src_file (H5File): A file that has already had its data reorganized into rounds
    """
    src_file.create_group('wavelengths')
    by_wavelength = src_file['wavelengths']
    for wavelength in wavelength_set(src_file):
        by_wavelength.create_group(wavelength)
    by_round = src_file['rounds']
    for rnd in subgroups(by_round):
        for wavelength in subgroups(by_round[rnd]):
            by_wavelength[wavelength].create_group(rnd)
            for item in ('time', 'perp', 'par'):
                # Assigning an existing object to a new name creates a hard link
                linked = by_round[f'{rnd}/{wavelength}/{item}']
                by_wavelength[f'{wavelength}/{rnd}/{item}'] = linked
Ejemplo n.º 22
0
def paths_for_signal(src_file: H5File,
                     chan: DeltaAChannel) -> Generator[str, None, None]:
    """Yields the paths to each dataset corresponding to the specified signal

    The wavelength groups are listed separately for every round rather than
    being taken from the first round only. A file without any rounds therefore
    yields nothing instead of raising an IndexError, and wavelengths that only
    appear in later rounds are still visited.

    Args:
        src_file (H5File): The file containing the experiment data
        chan (DeltaAChannel): The channel whose paths will be produced; its
            `name` is used as the last component of each path

    Yields:
        Paths to each of the datasets of the specified signal
    """
    rounds_root = src_file['rounds']
    for rnd in sorted(subgroups(rounds_root)):
        for wav in sorted(subgroups(rounds_root[rnd])):
            path = f'/rounds/{rnd}/{wav}/{chan.name}'
            try:
                src_file[path]
            except KeyError:
                # This wavelength group has no entry for the channel, skip it
                continue
            yield path
Ejemplo n.º 23
0
def create_rounds_from_spectra(old_file: H5File, new_file: H5File) -> None:
    """Creates a 'rounds' group in the new file with one `roundNNN` subgroup for
    each top level group of the old file whose name contains `spectrum<number>`

    The number is taken from the spectrum name and left-padded with zeros to at
    least three characters. Groups whose names do not match are ignored.

    Args:
        old_file (H5File): The original file with the experimental data
        new_file (H5File): The new, empty file in which the data will be organized
    """
    spectrum_pattern = re.compile(r'spectrum(\d+)')
    new_file.create_group('rounds')
    for group_name in subgroups(old_file):
        found = spectrum_pattern.search(group_name)
        if found is not None:
            number = found.group(1)
            new_file['rounds'].create_group(f'round{number:0>3s}')
Ejemplo n.º 24
0
def channel_heatmap(file: H5File, func: Callable[[Array, Array], np.float64],
                    channel: InputChannel, pump: PumpStatus) -> Array:
    """Computes one value per (round, wavelength) pair of the file.

    Rows follow the order in which `subgroups` lists the rounds, columns follow
    the sorted wavelengths of the whole file. For each pair the `time` dataset
    and the dataset of the requested channel are read from the requested
    pump/nopump group and passed to `func`; the result becomes the pixel value.
    A pair whose group does not exist gets the value 0.

    Note:
        The "func" argument must have the following signature:
            func(time: Array, signal: Array) -> np.float64

    Args:
        file (H5File): The file to generate the heatmap data from
        func (Callable): A function that will be used to compute the heatmap values
        channel (InputChannel): The input channel to use
        pump (PumpStatus): Which dataset (pump or nopump) the data should come from

    Returns:
        A (number of rounds)x(number of wavelengths) array that contains the heatmap pixels
    """
    rounds_root = file['rounds']
    round_names = subgroups(rounds_root)
    wavelength_names = sorted(wavelength_set(file))
    # Every element is assigned below, so no initialization is required
    pixels = np.empty((len(round_names), len(wavelength_names)),
                      dtype=np.float64)
    for row, rnd in enumerate(round_names):
        for col, wav in enumerate(wavelength_names):
            try:
                group = rounds_root[f'{rnd}/{wav}/{pump.value}']
            except KeyError:
                pixels[row, col] = 0
                continue
            time_dset = group['time']
            input_dset = group[f'{channel.value}']
            points = len(group['time'][...])
            # cast float32 data to float64 for computations to avoid rounding errors
            time_data = np.empty(points, dtype=np.float64)
            input_channel_data = np.empty(points, dtype=np.float64)
            time_dset.read_direct(time_data)
            input_dset.read_direct(input_channel_data)
            pixels[row, col] = func(time_data, input_channel_data)
    return pixels
Ejemplo n.º 25
0
def filter_faulty_groups(src_file: H5File) -> List[str]:
    """Returns the list of signal dataset paths contained in faulty groups

    A wavelength group is treated as faulty when the name of its first subgroup
    contains 'faulty'. All dataset paths below the subgroups of such a
    wavelength group are collected, except those containing 'time'. Wavelength
    groups without any subgroups are skipped.

    Args:
        src_file (H5File): The file containing the experimental data

    Returns:
        The paths to all of the signal datasets in faulty groups
    """
    faulty_paths = []
    for wav_path in rawnav.wavelengths_under_rounds_paths(src_file):
        pump_groups = subgroups(src_file[wav_path])
        # An empty wavelength group has no first subgroup to inspect
        if not pump_groups or 'faulty' not in pump_groups[0]:
            continue
        for p_name in pump_groups:
            pump_path = f'{wav_path}/{p_name}'
            faulty_paths.extend(
                p for p in dataset_paths_below(src_file, pump_path)
                if 'time' not in p)
    return faulty_paths
Ejemplo n.º 26
0
def dataset_paths_below(src_file: H5File, group_path: str) -> List[str]:
    """Collects the paths of every dataset found at any depth below a group.

    The datasets of the subgroups are listed first (recursively), followed by
    the datasets that sit directly in the parent group. One use case is listing
    every dataset below a group that has been deemed to contain bad data.

    Args:
        src_file (H5File): The file containing the experimental data
        group_path (str): The path to the parent group

    Returns:
        The paths of all datasets below the parent group
    """
    parent = src_file[group_path]
    found = []
    for child_name in subgroups(parent):
        found.extend(dataset_paths_below(src_file, f'{group_path}/{child_name}'))
    found.extend(f'{group_path}/{dset_name}' for dset_name in datasets(parent))
    return found