Beispiel #1
0
    def test(self, override=False):
        """
        Applies randomized SVD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to
        write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the
        input matrix.

        Parameters
        ----------
        override : bool, optional. default = False
            Set to true to recompute results if prior results are available. Else, returns existing results

        Returns
        -------
        U : :class:`numpy.ndarray`
            Abundance matrix
        S : :class:`numpy.ndarray`
            variance vector
        V : :class:`numpy.ndarray`
            eigenvector matrix

        Raises
        ------
        ValueError
            If either U or V could not be reshaped to its N-dimensional form
        """
        # NOTE(review): an older note at this spot (C.Smith) described checking that a number of
        # components had been set and was smaller than the minimum axis length of the data, using
        # fsvd in that case and the regular svd otherwise. The code below performs no such check
        # and always calls randomized_svd - confirm whether a fallback / lower limit on the number
        # of components is still wanted.
        if not override:
            if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0:
                # Re-use the most recent group of previously computed results
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \
                       reshape_to_n_dims(self.h5_results_grp['V'])[0]

        # Fresh computation - forget any previously referenced results group
        self.h5_results_grp = None

        t1 = time.time()

        self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(self.h5_main), self.num_components,
                                                      n_iter=3)
        # Bring V back to the data-type of the source dataset
        self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype)

        print('Took {} to compute randomized SVD'.format(format_time(time.time() - t1)))

        # U: original position indices; one spectroscopic index per column of U
        u_mat, success = reshape_to_n_dims(self.__u, h5_pos=self.h5_main.h5_pos_inds,
                                           h5_spec=np.expand_dims(np.arange(self.__u.shape[1]), axis=0))
        if not success:
            # success is falsy here (e.g. False), so it must be formatted rather than concatenated;
            # 'str' + bool would raise TypeError and hide the intended ValueError
            raise ValueError('Could not reshape U to N-Dimensional dataset! Error: {}'.format(success))

        # V: one position index per column of U; original spectroscopic indices
        v_mat, success = reshape_to_n_dims(self.__v, h5_pos=np.expand_dims(np.arange(self.__u.shape[1]), axis=1),
                                           h5_spec=self.h5_main.h5_spec_inds)
        if not success:
            raise ValueError('Could not reshape V to N-Dimensional dataset! Error: {}'.format(success))

        return u_mat, self.__s, v_mat
Beispiel #2
0
def format_time(time_in_seconds, decimals=2):
    """
    Formats the provided time in seconds to seconds, minutes, or hours

    This is a thin wrapper that emits a FutureWarning and then delegates to
    ``sut.format_time``.

    Parameters
    ----------
    time_in_seconds : number
        Time in seconds
    decimals : uint, optional. default = 2
        Number of decimal places to which the time needs to be formatted

    Returns
    -------
    str
        String with time formatted correctly
    """
    # Adjacent string literals are joined verbatim, so each fragment needs its
    # own trailing space (the previous message read "...willbe removed...").
    # stacklevel=2 attributes the warning to the caller rather than this wrapper.
    warn(
        'pyUSID.io.io_utils.format_time has been moved to '
        'sidpy.base.string_utils.format_time. This copy in pyUSID will '
        'be removed in a future release. Please update your import statements',
        FutureWarning, stacklevel=2)
    return sut.format_time(time_in_seconds, decimals=decimals)
Beispiel #3
0
    def compute(self, override=False, *args, **kwargs):
        """
        Creates placeholders for the results, applies the :meth:`~pyUSID.processing.process.Process._unit_computation`
        to chunks of the dataset

        Each pass of the main loop calls ``_unit_computation`` for the current chunk, writes the results via
        ``_write_results_chunk``, flushes the file that holds ``h5_main``, marks the pixels of the batch as
        completed in the status dataset and then calls ``_read_data_chunk`` again. The loop ends once
        ``self.data`` is None.

        Parameters
        ----------
        override : bool, optional. default = False
            By default, compute will simply return duplicate results to avoid recomputing or resume computation on a
            group with partial results. Set to True to force fresh computation.
        args : list
            arguments to the mapped function in the correct order
        kwargs : dict
            keyword arguments to the mapped function

        Returns
        -------
        h5_results_grp : :class:`h5py.Group`
            Group containing all the results
        """
        class SimpleFIFO(object):
            """
            Simple class that maintains a moving average of some numbers.
            Only the most recent ``length`` items are retained.
            """
            def __init__(self, length=5):
                """
                Create a SimpleFIFO object

                Parameters
                ----------
                length : unsigned integer
                    Number of values that need to be maintained for the moving average

                Raises
                ------
                TypeError
                    If length is not an int
                ValueError
                    If length is not positive
                """
                self.__queue = list()
                if not isinstance(length, int):
                    raise TypeError('length must be a positive integer')
                if length <= 0:
                    raise ValueError('length must be a positive integer')
                self.__max_length = length
                # Total number of items ever added (not the current queue size)
                self.__count = 0

            def put(self, item):
                """
                Adds the item to the internal queue. If the size of the queue exceeds its capacity, the oldest
                item is removed.

                Parameters
                ----------
                item : float or int
                    Any real valued number

                Raises
                ------
                TypeError
                    If item is not a Number or is complex
                """
                if (not isinstance(item, Number)) or isinstance(item, complex):
                    raise TypeError(
                        'Provided item: {} is not a Number'.format(item))
                self.__queue.append(item)
                self.__count += 1
                # Drop the oldest entry once the capacity is exceeded
                if len(self.__queue) > self.__max_length:
                    _ = self.__queue.pop(0)

            def get_mean(self):
                """
                Returns the average of the elements within the queue

                Returns
                -------
                avg : number.Number
                    Mean of all elements within the queue
                """
                return np.mean(self.__queue)

            def get_cycles(self):
                """
                Returns the number of items that have been added to the queue in total

                Returns
                -------
                count : int
                    number of items that have been added to the queue in total
                """
                return self.__count

        if not override:
            # Prefer returning already-completed results; otherwise try to resume from a partial group
            if len(self.duplicate_h5_groups) > 0:
                if self.mpi_rank == 0:
                    print('Returned previously computed results at ' +
                          self.duplicate_h5_groups[-1].name)
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                return self.duplicate_h5_groups[-1]
            elif len(self.partial_h5_groups
                     ) > 0 and self.h5_results_grp is None:
                if self.mpi_rank == 0:
                    print('Resuming computation in group: ' +
                          self.partial_h5_groups[-1].name)
                # presumably sets self.h5_results_grp to the partial group - confirm in use_partial_computation()
                self.use_partial_computation()

        resuming = False
        if self.h5_results_grp is None:
            # starting fresh
            if self.verbose and self.mpi_rank == 0:
                print('Creating HDF5 group and datasets to hold results')
            self._create_results_datasets()
            self._write_source_dset_provenance()
        else:
            # resuming from previous checkpoint
            resuming = True
            self._get_existing_datasets()

        self.__create_compute_status_dataset()

        if resuming and self.mpi_rank == 0:
            # Entries equal to 1 in the status dataset are counted as completed
            percent_complete = int(
                100 * len(np.where(self._h5_status_dset[()] == 1)[0]) /
                self._h5_status_dset.shape[0])
            print('Resuming computation. {}% completed already'.format(
                percent_complete))

        self.__assign_job_indices()

        # Not sure if this is necessary but I don't think it would hurt either
        if self.mpi_comm is not None:
            self.mpi_comm.barrier()

        # Moving averages (over the last 5 batches) of per-pixel compute and write times
        compute_times = SimpleFIFO(5)
        write_times = SimpleFIFO(5)
        # Remember where this rank started so that progress can be reported relative to it
        orig_rank_start = self.__start_pos

        if self.mpi_rank == 0 and self.mpi_size == 1:
            if self.__resume_implemented:
                print(
                    '\tThis class (likely) supports interruption and resuming of computations!\n'
                    '\tIf you are operating in a python console, press Ctrl+C or Cmd+C to abort\n'
                    '\tIf you are in a Jupyter notebook, click on "Kernel">>"Interrupt"\n'
                    '\tIf you are operating on a cluster and your job gets killed, re-run the job to resume\n'
                )
            else:
                print(
                    '\tThis class does NOT support interruption and resuming of computations.\n'
                    '\tIn order to enable this feature, simply implement the _get_existing_datasets() function'
                )

        if self.verbose and self.mpi_rank == self.__socket_master_rank:
            print('Rank: {} - with nothing loaded has {} free memory'
                  ''.format(self.mpi_rank,
                            format_size(get_available_memory())))

        # Load the first chunk; the loop below runs for as long as self.data is not None
        self._read_data_chunk()

        if self.mpi_comm is not None:
            self.mpi_comm.barrier()

        if self.verbose and self.mpi_rank == self.__socket_master_rank:
            print('Rank: {} - with only raw data loaded has {} free memory'
                  ''.format(self.mpi_rank,
                            format_size(get_available_memory())))

        while self.data is not None:

            num_jobs_in_batch = self.__end_pos - self.__start_pos

            t_start_1 = tm.time()

            self._unit_computation(*args, **kwargs)

            comp_time = np.round(tm.time() - t_start_1,
                                 decimals=2)  # in seconds
            time_per_pix = comp_time / num_jobs_in_batch
            compute_times.put(time_per_pix)

            if self.verbose:
                print(
                    'Rank {} - computed chunk in {} or {} per pixel. Average: {} per pixel'
                    '.'.format(self.mpi_rank, format_time(comp_time),
                               format_time(time_per_pix),
                               format_time(compute_times.get_mean())))

            # Ranks can become memory starved. Check memory usage - raw data + results in memory at this point
            if self.verbose and self.mpi_rank == self.__socket_master_rank:
                print(
                    'Rank: {} - now holding onto raw data + results has {} free memory'
                    ''.format(self.mpi_rank,
                              format_size(get_available_memory())))

            t_start_2 = tm.time()
            self._write_results_chunk()

            # NOW, update the positions. Users are NOT allowed to touch start and end pos
            self.__start_pos = self.__end_pos
            # Leaving in this provision that will allow restarting of processes
            if self.mpi_size == 1:
                self.h5_results_grp.attrs['last_pixel'] = self.__end_pos
            # Child classes don't even have to worry about flushing. Process will do it.
            self.h5_main.file.flush()

            dump_time = np.round(tm.time() - t_start_2, decimals=2)
            write_times.put(dump_time / num_jobs_in_batch)

            if self.verbose:
                print('Rank {} - wrote its {} pixel chunk in {}'.format(
                    self.mpi_rank, num_jobs_in_batch, format_time(dump_time)))

            # Estimate: pixels left for this rank x (average compute + average write time per pixel)
            time_remaining = (self.__rank_end_pos - self.__end_pos) * \
                             (compute_times.get_mean() + write_times.get_mean())

            if self.verbose or self.mpi_rank == 0:
                percent_complete = int(100 *
                                       (self.__end_pos - orig_rank_start) /
                                       (self.__rank_end_pos - orig_rank_start))
                print('Rank {} - {}% complete. Time remaining: {}'.format(
                    self.mpi_rank, percent_complete,
                    format_time(time_remaining)))

            # All ranks should mark the pixels for this batch as completed. 'last_pixel' attribute will be updated later
            # Setting each section to 1 independently
            for curr_slice in integers_to_slices(self.__pixels_in_batch):
                self._h5_status_dset[curr_slice] = 1

            # Fetch the next chunk (the loop condition re-checks self.data afterwards)
            self._read_data_chunk()

        if self.verbose:
            print('Rank {} - Finished computing all jobs!'.format(
                self.mpi_rank))

        # Wait for every rank to finish before declaring the dataset done
        if self.mpi_comm is not None:
            self.mpi_comm.barrier()

        if self.mpi_rank == 0:
            print('Finished processing the entire dataset!')

        # Update the legacy 'last_pixel' attribute here:
        if self.mpi_rank == 0:
            self.h5_results_grp.attrs['last_pixel'] = self.h5_main.shape[0]

        return self.h5_results_grp
Beispiel #4
0
    def test(self, rearrange_clusters=True, override=False):
        """
        Clusters the hdf5 dataset and calculates mean response for each cluster. This function does NOT write results to
        the hdf5 file. Call :meth:`~pycroscopy.processing.Cluster.compute()` to  write to the file.
        Handles complex, compound datasets such that the
        mean response vector for each cluster matrix is of the same data-type as the input matrix.

        Parameters
        ----------
        rearrange_clusters : bool, optional. Default = True
            Whether or not the clusters should be re-ordered by relative distances between the mean response
        override : bool, optional. default = False
            Set to true to recompute results if prior results are available. Else, returns existing results

        Returns
        -------
        labels : :class:`numpy.ndarray`
            1D unsigned integer array containing the cluster labels as obtained from the fit
        mean_response : :class:`numpy.ndarray`
            2D array containing the mean response for each cluster arranged as [cluster number, response]

        Raises
        ------
        ValueError
            If the labels or the mean response could not be reshaped to their N-dimensional form
        """
        if not override:
            if isinstance(self.duplicate_h5_groups,
                          list) and len(self.duplicate_h5_groups) > 0:
                # Re-use the most recent group of previously computed results
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(
                    self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return np.squeeze(reshape_to_n_dims(self.h5_results_grp['Labels'])[0]), \
                       reshape_to_n_dims(self.h5_results_grp['Mean_Response'])[0]

        # Fresh computation - forget any previously referenced results group
        self.h5_results_grp = None

        t1 = time.time()

        print('Performing clustering on {}.'.format(self.h5_main.name))
        # perform fit on the real dataset
        results = self.estimator.fit(
            self.data_transform_func(self.h5_main[self.data_slice]))

        print('Took {} to compute {}'.format(format_time(time.time() - t1),
                                             self.method_name))

        t1 = time.time()
        self.__mean_resp = self._get_mean_response(results.labels_)
        print('Took {} to calculate mean response per cluster'.format(
            format_time(time.time() - t1)))

        self.__labels = results.labels_
        if rearrange_clusters:
            self.__labels, self.__mean_resp = reorder_clusters(
                results.labels_, self.__mean_resp, self.data_transform_func)

        # TODO: What if test() is called repeatedly?
        # Labels: original position indices; a single spectroscopic index
        labels_mat, success = reshape_to_n_dims(
            np.expand_dims(np.squeeze(self.__labels), axis=1),
            h5_pos=self.h5_main.h5_pos_inds,
            h5_spec=np.expand_dims([0], axis=0))
        if not success:
            # success is falsy here (e.g. False), so it must be formatted rather than concatenated;
            # 'str' + bool would raise TypeError and hide the intended ValueError
            raise ValueError(
                'Could not reshape labels to N-Dimensional dataset! '
                'Error: {}'.format(success))

        # Mean response: one position index per cluster; first num_comps spectroscopic indices
        centroid_mat, success = reshape_to_n_dims(
            self.__mean_resp,
            h5_spec=self.h5_main.h5_spec_inds[:, :self.num_comps],
            h5_pos=np.expand_dims(np.arange(self.__mean_resp.shape[0]),
                                  axis=1))

        if not success:
            raise ValueError(
                'Could not reshape mean response to N-Dimensional dataset! '
                'Error: {}'.format(success))

        return np.squeeze(labels_mat), centroid_mat
    def test(self, override=False):
        """
        Decomposes the hdf5 dataset to calculate the components and projection. This function does NOT write results to
        the hdf5 file. Call :meth:`~pycroscopy.processing.Decomposition.compute()` to  write to the file. Handles
        complex, compound datasets such that the
        components are of the same data-type as the input matrix.

        Parameters
        ----------
        override : bool, optional. default = False
            Set to true to recompute results if prior results are available. Else, returns existing results

        Returns
        -------
        components : :class:`numpy.ndarray`
            Components
        projections : :class:`numpy.ndarray`
            Projections

        Raises
        ------
        ValueError
            If the projections or the components could not be reshaped to their N-dimensional form
        """
        if not override:
            if isinstance(self.duplicate_h5_groups,
                          list) and len(self.duplicate_h5_groups) > 0:
                # Re-use the most recent group of previously computed results
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(
                    self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return USIDataset(self.h5_results_grp['Components']).get_n_dim_form(), \
                       USIDataset(self.h5_results_grp['Projection']).get_n_dim_form()

        # Fresh computation - forget any previously referenced results group
        self.h5_results_grp = None

        print('Performing Decomposition on {}.'.format(self.h5_main.name))

        t0 = time.time()
        self._fit()
        # presumably populates self.__projection, which is read below - confirm in _transform()
        self._transform()
        print('Took {} to compute {}'.format(format_time(time.time() - t0),
                                             self.method_name))

        # Bring the components back to the data-type of the source dataset
        self.__components = stack_real_to_target_dtype(
            self.estimator.components_, self.h5_main.dtype)
        # Projection: original position indices; one spectroscopic index per projection column
        projection_mat, success = reshape_to_n_dims(
            self.__projection,
            h5_pos=self.h5_main.h5_pos_inds,
            h5_spec=np.expand_dims(np.arange(self.__projection.shape[1]),
                                   axis=0))
        if not success:
            # success is falsy here (e.g. False), so it must be formatted rather than concatenated;
            # 'str' + bool would raise TypeError and hide the intended ValueError
            raise ValueError(
                'Could not reshape projections to N-Dimensional dataset! '
                'Error: {}'.format(success))

        # Components: one position index per component; original spectroscopic indices
        components_mat, success = reshape_to_n_dims(
            self.__components,
            h5_spec=self.h5_main.h5_spec_inds,
            h5_pos=np.expand_dims(np.arange(self.__components.shape[0]),
                                  axis=1))

        if not success:
            raise ValueError(
                'Could not reshape components to N-Dimensional dataset! '
                'Error: {}'.format(success))

        return components_mat, projection_mat