Example #1
import numpy as np
from numbers import Number
from warnings import warn
# Imports assumed from pyUSID (paths match pyUSID 0.0.x; some helpers later moved to sidpy):
from pyUSID import USIDataset
from pyUSID.io.write_utils import Dimension
from pyUSID.io.hdf_utils import check_if_main, get_attr, create_results_group, write_main_dataset


def reshape_from_lines_to_pixels(h5_main, pts_per_cycle, scan_step_x_m=None):
    """
    Breaks up the provided raw G-mode dataset into lines and pixels (from just lines)

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the main dataset that contains the raw data that is only broken up by lines
    pts_per_cycle : unsigned int
        Number of points in a single pixel
    scan_step_x_m : float, optional. Default = 1
        Size of each pixel step along X, in meters

    Returns
    -------
    h5_resh : h5py.Dataset object
        Reference to the main dataset that contains the reshaped data
    """
    if not check_if_main(h5_main):
        raise TypeError('h5_main is not a Main dataset')
    h5_main = USIDataset(h5_main)
    if pts_per_cycle % 1 != 0 or pts_per_cycle < 1:
        raise TypeError('pts_per_cycle should be a positive integer')
    if scan_step_x_m is not None:
        if not isinstance(scan_step_x_m, Number):
            raise TypeError('scan_step_x_m should be a real number')
    else:
        scan_step_x_m = 1

    if h5_main.shape[1] % pts_per_cycle != 0:
        raise ValueError('Cannot reshape the provided dataset to pixels: {} points per line is not evenly '
                         'divisible by pts_per_cycle = {}'.format(h5_main.shape[1], pts_per_cycle))

    num_cols = int(h5_main.shape[1] / pts_per_cycle)

    # TODO: DO NOT assume simple 1 spectral dimension!
    single_ao = np.squeeze(h5_main.h5_spec_vals[:, :pts_per_cycle])

    spec_dims = Dimension(get_attr(h5_main.h5_spec_vals, 'labels')[0],
                          get_attr(h5_main.h5_spec_vals, 'units')[0], single_ao)

    # TODO: DO NOT assume simple 1D in positions!
    pos_dims = [Dimension('X', 'm', np.linspace(0, scan_step_x_m, num_cols)),
                Dimension('Y', 'm', np.linspace(0, h5_main.h5_pos_vals[1, 0], h5_main.shape[0]))]

    h5_group = create_results_group(h5_main, 'Reshape')
    # TODO: Create empty datasets and then write for very large datasets
    h5_resh = write_main_dataset(h5_group, (num_cols * h5_main.shape[0], pts_per_cycle), 'Reshaped_Data',
                                 get_attr(h5_main, 'quantity')[0], get_attr(h5_main, 'units')[0], pos_dims, spec_dims,
                                 chunks=(10, pts_per_cycle), dtype=h5_main.dtype, compression=h5_main.compression)

    # TODO: DON'T write in one shot assuming small datasets fit in memory!
    print('Starting to reshape G-mode line data. Please be patient')
    h5_resh[()] = np.reshape(h5_main[()], (-1, pts_per_cycle))

    print('Finished reshaping G-mode line data to rows and columns')

    return USIDataset(h5_resh)
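
A short usage sketch (the file name, internal path, and parameter values below are assumptions for illustration, not part of the library):

import h5py

with h5py.File('gmode_lines.h5', mode='r+') as h5_f:  # hypothetical file
    h5_raw = h5_f['Measurement_000/Channel_000/Raw_Data']  # hypothetical path to the Main dataset
    # Assume 2048 points per pixel and 100 nm pixel steps along X:
    h5_resh = reshape_from_lines_to_pixels(h5_raw, 2048, scan_step_x_m=100e-9)
    print(h5_resh.shape)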
Example #2
    def __init__(self,
                 h5_main,
                 cores=None,
                 max_mem_mb=4 * 1024,
                 verbose=False):
        """
        Parameters
        ----------
        h5_main : h5py.Dataset instance
            The dataset over which the analysis will be performed. This dataset should be linked to the spectroscopic
            indices and values, and position indices and values datasets.
        cores : uint, optional
            How many CPU cores to use for the computation (ignored when running via MPI,
            where all logical cores on each node are used). Default: all available cores minus 2
        max_mem_mb : uint, optional
            How much memory (in MB) to use for the computation. Default = 4096 MB
        verbose : bool, optional. Default = False
            Whether or not to print debugging statements
        """

        if h5_main.file.mode != 'r+':
            raise TypeError(
                'Need to ensure that the file is in r+ mode to write results back to the file'
            )

        if MPI is not None:
            # If we got here, the user has intentionally asked for multi-node computation
            comm = MPI.COMM_WORLD
            self.mpi_comm = comm
            self.mpi_rank = comm.Get_rank()
            self.mpi_size = comm.Get_size()

            print("Rank {} of {} on {} sees {} logical cores on the socket".
                  format(comm.Get_rank(), comm.Get_size(),
                         socket.gethostname(), cpu_count()))

            # First, set cores to the number of logical cores on the node; no point being economical here
            cores = psutil.cpu_count()

            # It is sufficient if just one rank checks all this.
            if verbose and self.mpi_rank == 0:
                print('Working on {} nodes via MPI'.format(self.mpi_size))

            # Ensure that the file was opened with the MPI-IO driver
            if h5_main.file.driver != 'mpio':
                raise TypeError(
                    'The HDF5 file should have been opened with driver="mpio". Current driver = "{}"'
                    .format(h5_main.file.driver))
            """
            # Not sure how to check for this correctly
            messg = None
            try:
                if h5_main.file.comm != comm:
                    messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD. Currently comm={}'.format(h5_main.file.comm)
            except AttributeError:
                messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD'
            if messg is not None:
                raise TypeError(messg)
            """

        else:
            if verbose:
                print('No mpi4py found. Assuming single-node computation')
            self.mpi_comm = None
            self.mpi_size = 1
            self.mpi_rank = 0

        # Checking if dataset is "Main"
        if self.mpi_rank == 0:
            if not check_if_main(h5_main, verbose=verbose):
                raise ValueError(
                    'Provided dataset is not a "Main" dataset with necessary ancillary datasets'
                )
        # Not sure if we need a barrier here.

        # Saving these as properties of the object:
        self.h5_main = USIDataset(h5_main)
        self.verbose = verbose
        self._max_pos_per_read = None
        self._max_mem_mb = None

        # Be careful here: the properties below are a function of the MPI rank
        self._start_pos = None
        self._rank_end_pos = None
        self._end_pos = None
        self.__assign_job_indices(start=0)

        # Determine the maximum amount of data that can be read into memory.
        # All ranks go through this and each of them needs to have this value.
        self._set_memory_and_cores(cores=cores, mem=max_mem_mb)
        self.duplicate_h5_groups = []
        self.partial_h5_groups = []
        self.process_name = None  # Reset this in the extended classes
        self.parms_dict = None

        self._results = None
        self.h5_results_grp = None

        if self.mpi_rank == 0:
            print(
                'Consider calling test() to check results before calling compute() which computes on the entire'
                ' dataset and writes back to the HDF5 file')
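
A minimal sketch of reaching this constructor. MyProcess, the file name, and the dataset path are all hypothetical; the enclosing class is assumed to follow pyUSID's Process pattern:

import h5py

with h5py.File('measurement.h5', mode='r+') as h5_f:  # 'r+' mode is mandatory
    h5_main = h5_f['Measurement_000/Channel_000/Raw_Data']  # hypothetical path
    # MyProcess is a hypothetical subclass that supplies the actual computation
    proc = MyProcess(h5_main, cores=2, max_mem_mb=2048, verbose=True)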
Example #3
def create_empty_dataset(source_dset,
                         dtype,
                         dset_name,
                         h5_group=None,
                         new_attrs=None,
                         skip_refs=False):
    """
    Creates an empty dataset in the h5 file based on the provided dataset in the same or specified group

    Parameters
    ----------
    source_dset : h5py.Dataset object
        Source object that provides information on the group and shape of the dataset
    dtype : dtype
        Data type of the fit / guess datasets
    dset_name : String / Unicode
        Name of the dataset
    h5_group : h5py.Group object, optional. Default = None
        Group within which this dataset will be created
    new_attrs : dictionary (Optional)
        Any new attributes that need to be written to the dataset
    skip_refs : boolean, optional
        Should ObjectReferences and RegionReferences be skipped when copying attributes from the
        `source_dset`

    Returns
    -------
    h5_new_dset : h5py.Dataset object
        Newly created dataset
    """
    import h5py
    from pyUSID.io.dtype_utils import validate_dtype
    from pyUSID.io.hdf_utils import copy_attributes, check_if_main, write_book_keeping_attrs
    from pyUSID import USIDataset
    import sys
    if sys.version_info.major == 3:
        unicode = str

    if not isinstance(source_dset, h5py.Dataset):
        raise TypeError('source_dset should be a h5py.Dataset object')
    _ = validate_dtype(dtype)
    if new_attrs is not None:
        if not isinstance(new_attrs, dict):
            raise TypeError('new_attrs should be a dictionary')
    else:
        new_attrs = dict()

    if h5_group is None:
        h5_group = source_dset.parent
    else:
        if not isinstance(h5_group, (h5py.Group, h5py.File)):
            raise TypeError(
                'h5_group should be a h5py.Group or h5py.File object')

    if not isinstance(dset_name, (str, unicode)):
        raise TypeError('dset_name should be a string')
    dset_name = dset_name.strip()
    if len(dset_name) == 0:
        raise ValueError('dset_name cannot be empty!')
    if '-' in dset_name:
        warn(
            'dset_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(dset_name, dset_name.replace('-', '_')))
    dset_name = dset_name.replace('-', '_')

    if dset_name in h5_group.keys():
        if isinstance(h5_group[dset_name], h5py.Dataset):
            warn('A dataset named: {} already exists in group: {}'.format(
                dset_name, h5_group.name))
            h5_new_dset = h5_group[dset_name]
            # Make sure it has the correct shape and dtype
            if any((source_dset.shape != h5_new_dset.shape,
                    dtype != h5_new_dset.dtype)):
                warn(
                    'Either the shape (existing: {} desired: {}) or dtype (existing: {} desired: {}) of the dataset '
                    'did not match with expectations. Deleting and creating a new one.'
                    .format(h5_new_dset.shape, source_dset.shape,
                            h5_new_dset.dtype, dtype))
                del h5_new_dset, h5_group[dset_name]
                h5_new_dset = h5_group.create_dataset(
                    dset_name,
                    shape=source_dset.shape,
                    dtype=dtype,
                    chunks=source_dset.chunks)
        else:
            raise KeyError('{} is already a {} in group: {}'.format(
                dset_name, type(h5_group[dset_name]), h5_group.name))

    else:
        h5_new_dset = h5_group.create_dataset(dset_name,
                                              shape=source_dset.shape,
                                              dtype=dtype,
                                              chunks=source_dset.chunks)

    # This should link the ancillary datasets correctly
    h5_new_dset = copy_attributes(source_dset,
                                  h5_new_dset,
                                  skip_refs=skip_refs)
    h5_new_dset.attrs.update(new_attrs)

    if check_if_main(h5_new_dset):
        h5_new_dset = USIDataset(h5_new_dset)
        # update book keeping attributes
        write_book_keeping_attrs(h5_new_dset)

    return h5_new_dset
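
A short usage sketch, assuming `h5_main` is an already-open h5py.Dataset; the dataset name and attribute below are illustrative only:

import numpy as np

# Mirror the shape of h5_main with an empty float32 'Guess' dataset in the same group
h5_guess = create_empty_dataset(h5_main, np.float32, 'Guess',
                                new_attrs={'algorithm': 'least_squares'})  # illustrative attribute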
Example #5
    def __init__(self,
                 h5_main,
                 *args,
                 verbose=False,
                 threading=False,
                 **kwargs):
        """
        Parameters
        ----------
        h5_main : :class:`~pyUSID.io.usi_data.USIDataset`
            The USID main HDF5 dataset over which the analysis will be performed.
        verbose : bool, optional. Default = False
            Whether or not to print debugging statements
        threading : bool, optional. Default = False
            Whether to compute using threads; set to False for distributed computation
        """

        if h5_main.file.mode != 'r+':
            raise TypeError(
                'Need to ensure that the file is in r+ mode to write results back to the file'
            )

        # Checking if dataset is "Main"
        if not check_if_main(h5_main):
            raise ValueError(
                'Provided dataset is not a "Main" dataset with necessary ancillary datasets'
            )

        # Saving these as properties of the object:
        self.h5_main = USIDataset(h5_main)
        self.verbose = verbose
        self.threading = threading  # set to False for distributed computation
        self.dtype = h5_main.dtype  # used to restore the dtype after dask arrays alter it

        self.duplicate_h5_groups = []
        self.partial_h5_groups = []
        self.process_name = None  # Reset this in the extended classes
        self.parms_dict = None
        """
        The name of the HDF5 dataset that should be present to signify which positions have already been computed
        This is NOT a fully private variable so that multiple processes can be run within a single group - Eg Fitter
        In the case of Fitter - this name can be changed from 'completed_guesses' to 'completed_fits'
        check_for_duplicates will be called by the Child class where they have the opportunity to change this
        variable before checking for duplicates
        """
        self._status_dset_name = 'completed_positions'

        self._results = None
        self.h5_results_grp = None

        # Check whether the resuming feature has been implemented:
        self._resume_implemented = False
        try:
            self._get_existing_datasets()
        except NotImplementedError:
            if verbose:
                print(
                    'It appears that this class may not be able to resume computations'
                )
        except Exception:
            # Any other error (NameError / AttributeError for state that does not exist yet
            # at construction time, TypeError from NoneType, etc.) means the child class has
            # overridden _get_existing_datasets(), so resuming is implemented.
            self._resume_implemented = True

        print(
            'Consider calling test() to check results before calling compute() which computes on the entire'
            ' dataset and writes back to the HDF5 file')
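
A hedged sketch of a child class, based only on the attributes this constructor references (`process_name`, `parms_dict`, `_get_existing_datasets`); every name below is an assumption for illustration, not the library's actual API:

class MyAnalysis(Process):  # 'Process' assumed to be the enclosing class's name
    def __init__(self, h5_main, **kwargs):
        super().__init__(h5_main, **kwargs)
        self.process_name = 'My_Analysis'   # reset here, as the comment above instructs
        self.parms_dict = {'param_1': 1.0}  # illustrative parameters only

    def _get_existing_datasets(self):
        # Implementing this enables resuming partial computations
        self.h5_results = self.h5_results_grp['My_Results']  # hypothetical dataset name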