Example No. 1
    def execute(self, **kwargs):
        """
        Overwrite the default implementation of execute to update parameter specifications/types
        when wrapping functions where the types are not known a priori.

        :param kwargs: Custom analysis parameters

        :return: The result of execute_analysis()
        """
        # Update the dtype of all the input parameters to ensure we save them correctly to file
        log_helper.debug(__name__, "Setting parameters based on the given inputs")
        ana_dtypes = data_dtypes.get_dtypes()
        for k, v in kwargs.iteritems():
            for param in self.parameters:
                if param['name'] == k:
                    if hasattr(v, 'dtype'):
                        param['dtype'] = ana_dtypes['ndarray']
                    else:
                        param['dtype'] = type(v)
        # Determine the custom parameters
        custom_parameters = kwargs

        # Execute the analysis as usual
        result = super(analysis_generic, self).execute(**custom_parameters)
        return result
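The check above relies on the fact that NumPy arrays (and array-like objects) expose a `dtype` attribute, while plain Python values do not. A minimal standalone sketch of that detection step (the helper name is illustrative, not part of the library):

    import numpy as np

    def infer_param_dtype(value, ndarray_marker='ndarray'):
        # Array-like inputs are tagged with the special 'ndarray' marker,
        # everything else keeps its plain Python type (as in the loop above).
        return ndarray_marker if hasattr(value, 'dtype') else type(value)

    print(infer_param_dtype(np.arange(3)))  # prints: ndarray
    print(infer_param_dtype(42))            # prints the plain int type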
Example No. 2
    def __read_all(self, filename):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry, which here holds a single 3D datacube.
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__,'READING ALL DATA!! GIVE ME RAM (please)!')

        # Compute the bin edges for reinterpolation if needed
        if self.imzml_type == self.available_imzml_types['processed']:
            shift = np.diff(self.mz).mean()
            bin_edges = np.append(self.mz, self.mz[-1] + shift)
        else:
            bin_edges = None
        for ind in xrange(0, len(reader.coordinates)):
            xidx, yidx = reader.coordinates[ind]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Reinterpolate intensities if we are in processed mode
            if bin_edges is not None:
                f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
                intens = f(self.mz)
                #intens, bin_edges_new = np.histogram(mz, bins=bin_edges, weights=intens)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
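In processed mode the spectrum is resampled onto the shared `self.mz` axis via `scipy.interpolate.interp1d`, with out-of-range points filled with zero instead of raising an error. A standalone sketch of that resampling step with made-up arrays:

    import numpy as np
    from scipy import interpolate

    common_mz = np.linspace(100.0, 110.0, 11)   # shared m/z axis of the datacube
    mz = np.array([101.3, 104.7, 108.2])        # m/z values of a single spectrum
    intens = np.array([5.0, 9.0, 2.0])          # matching intensities

    # bounds_error=False with fill_value=0 gives zero intensity outside the measured range
    f = interpolate.interp1d(mz, intens, fill_value=0.0, bounds_error=False)
    resampled = f(common_mz)
    print(resampled.shape)                      # (11,), one spectrum of the datacube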
Example No. 3
    def __init__(self,
                 analysis_objects=None):
        """
        Initialize the workflow executor

        :param analysis_objects: A list of analysis objects to be executed
        """
        super(workflow_executor_base, self).__init__()
        log_helper.debug(__name__, "Creating workflow executor")
        if analysis_objects is not None:
            if not isinstance(analysis_objects, list) and not isinstance(analysis_objects, set):
                analysis_objects = [analysis_objects, ]
        log_helper.log_var(__name__, analysis_objects=analysis_objects, level='DEBUG')
        self.run_info = run_info_dict()
        self.analysis_tasks = analysis_task_list(analysis_objects) \
            if analysis_objects is not None \
            else analysis_task_list()
        self.mpi_comm = mpi_helper.get_comm_world()
        self.mpi_root = 0
        self.workflow_identifier = "we"
        # self.parameters = []  # Inherited from parameter_manager and set in parent class

        dtypes = data_dtypes.get_dtypes()
        self.add_parameter(name='profile_time_and_usage',
                           help='Enable/disable profiling of time and usage of the whole workflow',
                           required=False,
                           default=False,
                           dtype=dtypes['bool'])
        self.add_parameter(name='profile_memory',
                           help='Enable/disable profiling of memory usage of the whole workflow',
                           required=False,
                           default=False,
                           dtype=dtypes['bool'])
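The `dtypes['bool']` helper is presumably used here instead of the built-in `bool` because `bool('False')` evaluates to `True` when parameter values arrive as command-line strings. An illustrative converter of that kind (a hypothetical sketch, not the library's implementation):

    def str_to_bool(value):
        # Interpret the usual command-line spellings of a boolean flag
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', '1', 'yes'):
            return True
        if text in ('false', '0', 'no'):
            return False
        raise ValueError('Not a boolean value: %s' % value)

    print(bool('False'), str_to_bool('False'))  # True False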
Example No. 4
    def enable_profile_memory(self, enable=True):
        """
        Enable/disable profiling of memory usage

        :param enable: boolean to enable (True) or disable (False) memory profiling

        """
        if PROFILE_MEMORY_AVAILABLE:
            if not enable and self.__profile_memory:
                log_helper.debug(__name__,
                                 "Disabled memory profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            if enable and not self.__profile_memory:
                log_helper.debug(__name__,
                                 "Enabled memory profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            self.__profile_memory = enable
        else:
            self.__profile_memory = False
            if enable:
                log_helper.warning(
                    __name__, 'Profiling of memory usage not available.' +
                    ' Missing memory_profiler or StringIO package')
Example No. 5
    def record_postexecute(self, execution_time=None):
        """
        Function used to record runtime information after the task we want to track is completed, e.g.
        the `execute_analysis(...)` function of a standard analysis.

        The function may be overwritten in child classes to add recording of
        additional runtime information.

        When overwriting the function we should call super(..., self).record_postexecute(execution_time)
        in the custom version to ensure that the execution_time and end_time are properly
        recorded.

        :param execution_time: The total time it took to execute the analysis. May be None, in which
            case the function will attempt to compute the execution time based on the start_time
            (if available) and the current time.

        :param comm: Used for logging only. The MPI communicator to be used. Default value is None,
            in which case MPI.COMM_WORLD is used.

        """
        log_helper.debug(__name__, 'Recording post-execution runtime data', root=self.mpi_root, comm=self.mpi_comm)
        # Finalize recording of post execution provenance
        self['end_time'] = unicode(datetime.datetime.now())
        if execution_time is not None:
            self['execution_time'] = unicode(execution_time)
        elif 'start_time' in self:
            start_time = run_info_dict.string_to_time(self['start_time'])
            stop_time = run_info_dict.string_to_time(self['end_time'])
            self['execution_time'] = unicode(stop_time - start_time)    # TODO: This only gives execution time in full seconds right now
        else:
            self['execution_time'] = None
        # Attempt to record psutil data
        try:
            import psutil
            process = psutil.Process()
            self['memory_info_after'] = unicode(process.memory_info())
        except ImportError:
            log_helper.warning(__name__, 'psutil not installed. Recording of part of runtime information not possible',
                               root=self.mpi_root, comm=self.mpi_comm)
        except:
            warnings.warn("Recording of psutil-based runtime information failed: "+str(sys.exc_info()))

        # Record the time and use profiling data if possible
        if self.__time_and_use_profiler is not None:
            self.__time_and_use_profiler.disable()
            self.__time_and_use_profiler.create_stats()
            self['profile'] = unicode(self.__time_and_use_profiler.stats)
            # Save the summary statistics for the profiling data
            stats_io = StringIO.StringIO()
            profiler_stats = pstats.Stats(self.__time_and_use_profiler, stream=stats_io).sort_stats('cumulative')
            profiler_stats.print_stats()
            self['profile_stats'] = stats_io.getvalue()

        # Record the memory profiling data if possible
        if self.__memory_profiler is not None and self.get_profile_memory():
            log_helper.debug(__name__, 'Recording memory profiling data', root=self.mpi_root, comm=self.mpi_comm)
            mem_stats_io = StringIO.StringIO()
            memory_profiler.show_results(self.__memory_profiler, stream=mem_stats_io)
            self['profile_mem'] = unicode(self.__memory_profiler.code_map)
            self['profile_mem_stats'] = mem_stats_io.getvalue()
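The time-and-usage branch follows the standard `cProfile`/`pstats` pattern: disable the profiler, build the stats, and print a cumulative-time summary into an in-memory stream. A standalone Python 3 sketch of that pattern (Python 3 provides `io.StringIO` in place of the `StringIO` module used above):

    import cProfile
    import io
    import pstats

    profiler = cProfile.Profile()
    profiler.enable()
    sum(i * i for i in range(100000))   # stand-in for the tracked task
    profiler.disable()
    profiler.create_stats()

    stats_io = io.StringIO()
    pstats.Stats(profiler, stream=stats_io).sort_stats('cumulative').print_stats()
    profile_stats = stats_io.getvalue()  # the same kind of string stored in 'profile_stats'
    print(profile_stats[:200])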
Example No. 6
    def execute(self, **kwargs):
        """
        Overwrite the default implementation of execute to update parameter specifications/types
        when wrapping functions where the types are not known a priori.

        :param kwargs: Custom analysis parameters

        :return: The result of execute_analysis()
        """
        # Update the dtype of all the input parameters to ensure we save them correctly to file
        log_helper.debug(__name__,
                         "Setting parameters based on the given inputs")
        ana_dtypes = data_dtypes.get_dtypes()
        for k, v in kwargs.iteritems():
            for param in self.parameters:
                if param['name'] == k:
                    if hasattr(v, 'dtype'):
                        param['dtype'] = ana_dtypes['ndarray']
                    else:
                        param['dtype'] = type(v)
        # Determine the custom parameters
        custom_parameters = kwargs

        # Execute the analysis as usual
        result = super(analysis_generic, self).execute(**custom_parameters)
        return result
Example No. 7
    def enable_profile_time_and_usage(self, enable=True):
        """
        Enable/disable time and usage profiling

        :param enable: boolean to enable (True) or disable (False) time and usage profiling

        """
        if PROFILE_AVAILABLE:
            if not enable and self.__profile_time_and_usage:
                log_helper.debug(__name__,
                                 "Disabled time and usage profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            if enable and not self.__profile_time_and_usage:
                log_helper.debug(__name__,
                                 "Enabled time and usage profiling. ",
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
            self.__profile_time_and_usage = enable
        else:
            self.__profile_time_and_usage = False
            if enable:
                log_helper.warning(
                    __name__, 'Profiling of time and usage not available.' +
                    ' Missing profile and/or pstats package')
Example No. 8
    def append(self, analysis_object):
        """
        Add a given analysis to the set of objects to be executed by the workflow

        This is the same as set.add() but we ensure that only analysis_base objects
        are added.

        :param analysis_object: Analysis object to be added to the execution.
            All dependencies of the analysis will also be executed as part of the
            execution.
        :type analysis_object: omsi.analysis.base.analysis_base

        :raises: ValueError is raised if the given analysis_object is invalid
        """
        from omsi.analysis.base import analysis_base
        if isinstance(analysis_object, analysis_base):
            if analysis_object in self:
                log_helper.debug(__name__,
                                 "Analysis already in the list of tasks")
                return
            log_helper.info(
                __name__, "Adding analysis object to the workflow set. " +
                str(analysis_object))
            super(analysis_task_list, self).append(analysis_object)
        else:
            raise ValueError(
                'Analysis is not of type omsi.analysis.base.analysis_base')
Example No. 9
    def gather(self):
        """
        Simple helper function to gather the runtime information---that has been collected on
        multiple processes when running using MPI---on a single root process

        :return: If we have more than one process, then this function returns a
            dictionary with the same keys as usual for the run_info but the
            values are now lists with one entry per MPI process. If we only have
            a single process, then the run_info object will be returned without
            changes. NOTE: Similar to MPI gather, the function only collects
            information on the root. All other processes will return just their
            own private runtime information.

        """
        if mpi_helper.MPI_AVAILABLE:
            if self.mpi_comm.Get_size() > 1:
                log_helper.debug(__name__,
                                 'Gather runtime data from parallel tasks',
                                 root=self.mpi_root,
                                 comm=self.mpi_comm)
                self['mpi_rank'] = self.mpi_comm.Get_rank()
                run_data = self.mpi_comm.gather(self, self.mpi_root)
                if self.mpi_comm.Get_rank() == self.mpi_root:
                    merged_run_data = {}
                    for run_dict in run_data:
                        for key in run_dict:
                            try:
                                merged_run_data[key].append(run_dict[key])
                            except KeyError:
                                merged_run_data[key] = [run_dict[key]]
                    return merged_run_data
        return self
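The gather step collects one runtime dictionary per MPI rank on the root and merges them key-wise into lists. A minimal mpi4py sketch of the same merge with plain dicts (assumes mpi4py is installed; run with e.g. `mpiexec -n 4 python gather_sketch.py`):

    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    root = 0
    run_info = {'mpi_rank': comm.Get_rank(), 'execution_time': '1.0'}

    run_data = comm.gather(run_info, root=root)   # list of dicts on root, None elsewhere
    if comm.Get_rank() == root:
        merged_run_data = {}
        for run_dict in run_data:
            for key, value in run_dict.items():
                merged_run_data.setdefault(key, []).append(value)
        print(merged_run_data)   # e.g. {'mpi_rank': [0, 1, 2, 3], 'execution_time': [...]}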
Example No. 10
    def gather(self):
        """
        Simple helper function to gather the runtime information---that has been collected on
        multiple processes when running using MPI---on a single root process

        :return: If we have more than one process, then this function returns a
            dictionary with the same keys as usual for the run_info but the
            values are now lists with one entry per MPI process. If we only have
            a single process, then the run_info object will be returned without
            changes. NOTE: Similar to MPI gather, the function only collects
            information on the root. All other processes will return just their
            own private runtime information.

        """
        if mpi_helper.MPI_AVAILABLE:
            if self.mpi_comm.Get_size() > 1:
                log_helper.debug(__name__, 'Gather runtime data from parallel tasks',
                                 root=self.mpi_root, comm=self.mpi_comm)
                self['mpi_rank'] = self.mpi_comm.Get_rank()
                run_data = self.mpi_comm.gather(self, self.mpi_root)
                if self.mpi_comm.Get_rank() == self.mpi_root:
                    merged_run_data = {}
                    for run_dict in run_data:
                        for key in run_dict:
                            try:
                                merged_run_data[key].append(run_dict[key])
                            except KeyError:
                                merged_run_data[key] = [run_dict[key]]
                    return merged_run_data
        return self
Example No. 11
    def __read_all(self, filename):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry, which here holds a single 3D datacube.
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

        # Compute the bin edges for reinterpolation if needed
        if self.imzml_type == self.available_imzml_types['processed']:
            shift = np.diff(self.mz).mean()
            bin_edges = np.append(self.mz, self.mz[-1] + shift)
        else:
            bin_edges = None
        for ind in xrange(0, len(reader.coordinates)):
            xidx, yidx = reader.coordinates[ind]
            # Coordinates may start at arbitrary locations, hence, we need to subtract the minimum to recenter at (0,0)
            xidx -= self.x_pos_min
            yidx -= self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Reinterpolate intensities if we are in processed mode
            if bin_edges is not None:
                intens, bin_edges_new = np.histogram(mz,
                                                     bins=bin_edges,
                                                     weights=intens)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
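Unlike the interpolation in Example No. 2, this variant re-bins the spectrum with `np.histogram`, using the intensities as weights so that peaks falling into the same target bin are summed. A standalone sketch with made-up peak lists:

    import numpy as np

    target_mz = np.linspace(100.0, 110.0, 11)                 # m/z axis of the datacube
    shift = np.diff(target_mz).mean()
    bin_edges = np.append(target_mz, target_mz[-1] + shift)   # one extra edge, as above

    peak_mz = np.array([100.2, 105.1, 105.4, 109.7])          # centroided peaks
    peak_intens = np.array([3.0, 4.0, 6.0, 1.0])

    intens, _ = np.histogram(peak_mz, bins=bin_edges, weights=peak_intens)
    print(intens)   # length 11; the two peaks near m/z 105 are summed into one bin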
Example No. 12
    def clear(self):
        """
        Remove all analyses from the workflow.

        Shorthand for: self.analysis_tasks.clear()
        """
        log_helper.debug(__name__, "Clearing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        self.analysis_tasks.clear()
Example No. 13
    def __read_all(self):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry.  Data is now a list of datacubes
        """

        self.data = [
            np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type)
            for scan_idx, scantype in enumerate(self.scan_types)
        ]

        for scan_idx, scantype in enumerate(self.scan_types):
            reader = mzml.read(self.basename)
            spectrumid = 0
            if not self.scan_profiled[scan_idx]:
                shift = np.diff(self.mz_all[scan_idx]).mean()
                bin_edges = np.append(self.mz_all[scan_idx],
                                      self.mz_all[scan_idx][-1] + shift)
            else:
                bin_edges = None

            for spectrum in reader:
                if spectrum['scanList']['scan'][0][
                        'filter string'] == scantype:
                    x = spectrum['m/z array']
                    try:
                        y = spectrum['intensity array']
                    except KeyError:
                        raise KeyError
                    if bin_edges is None:
                        yi = np.interp(
                            self.mz_all[scan_idx], x, y, 0,
                            0)  # Re-interpolate the data in profiled mode
                    else:
                        yi, _ = np.histogram(
                            x, bins=bin_edges, weights=y
                        )  # Re-histogram the data in centroided mode
                    xidx = np.nonzero(
                        self.x_pos == self.coordinates[spectrumid, 0])[0]
                    yidx = np.nonzero(
                        self.y_pos == self.coordinates[spectrumid, 1])[0]
                    try:
                        self.data[scan_idx][xidx, yidx, :] = yi
                    except:
                        log_helper.debug(__name__, spectrumid, scan_idx,
                                         scantype, self.mz_all[scan_idx].shape)
            # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
                if spectrumid % 1000 == 0:
                    log_helper.info(
                        __name__,
                        'Processed data for %s spectra to datacube for scan type %s'
                        % (spectrumid, scantype))
                spectrumid += 1
Example No. 14
    def define_missing_parameters(self):
        """
        Set any required parameters that have not been defined to their respective default values.

        This function may be overwritten in child classes to customize
        the definition of default parameter values and to apply any
        modifications (or checks) of parameters before the analysis is executed.
        Any changes applied here will be recorded in the parameter of the analysis.
        """
        log_helper.debug(__name__, "Define missing parameters to default")
        for param in self.parameters:
            if param['required'] and not param.data_set():
                param['data'] = param['default']
Example No. 15
    def define_missing_parameters(self):
        """
        Set any required parameters that have not been defined to their respective default values.

        This function may be overwritten in child classes to customize
        the definition of default parameter values and to apply any
        modifications (or checks) of parameters before the analysis is executed.
        Any changes applied here will be recorded in the parameter of the analysis.
        """
        log_helper.debug(__name__, "Define missing parameters to default")
        for param in self.parameters:
            if param['required'] and not param.data_set():
                param['data'] = param['default']
Example No. 16
    def execute(self):
        """
        Execute the workflow. This uses the main() function to run the actual workflow.
        """
        log_helper.debug(__name__, "Execute", root=self.mpi_root, comm=self.mpi_comm)
        result = self.run_info(self.main)()
        try:
            log_helper.debug(__name__, 'Execution time: ' + str(self.run_info['execution_time']) + "s",
                             root=self.mpi_root, comm=self.mpi_comm)
        except (KeyError, ValueError):
            pass

        # 4) Return the result of the execution
        return result
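`self.run_info(self.main)()` uses the run_info object as a wrapper around `main`, so that timing (and, if enabled, profiling) is recorded around the call. A minimal stand-in illustrating that calling pattern (a sketch only, not the actual run_info_dict implementation):

    import time

    class run_info_sketch(dict):
        """Calling the dict with a function returns a wrapped version that
        records the execution time in the dict (illustrative only)."""
        def __call__(self, func):
            def wrapped(*args, **kwargs):
                start = time.time()
                result = func(*args, **kwargs)
                self['execution_time'] = str(time.time() - start)
                return result
            return wrapped

    run_info = run_info_sketch()
    result = run_info(lambda: sum(range(1000000)))()
    print(run_info['execution_time'], result)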
Example No. 17
    def set_parameter_default_value(self, name, value):
        """
        Set the default value of the parameter with the given name

        :param name: Name of the parameter
        :param value: New value

        :raises: KeyError if parameter not found
        """
        log_helper.debug(__name__, "Setting default value of " + str(name) + " to " + str(value))
        param = self.get_parameter_data_by_name(dataname=name)
        if isinstance(param, parameter_data):
            param['default'] = value
        else:
            raise KeyError('Unknown parameter ' + str(name))
Example No. 18
    def __setitem__(self, key, value):
        """
        Set workflow driver parameter options directly via slicing

        Overwrite this function in child classes to implement custom setting behavior, e.g., error
        checking for valid values before setting a non-standard parameter.

        :param key: name of the parameter
        :param value: new value

        :raise: ValueError if an invalid value is given
        :raise: KeyError if an invalid key is given
        """
        log_helper.debug(__name__, 'Setting parameter ' + key, root=self.mpi_root, comm=self.mpi_comm)
        return super(workflow_executor_base, self).__setitem__(key, value)
Example No. 19
 def create_analysis_object(self):
     """
     Initialize the analysis object, i.e., set self.analysis_object
     """
     if self.analysis_class is not None:
         if not isinstance(self.analysis_object, self.analysis_class):
             self.analysis_object = None
         if self.analysis_object is None:
             log_helper.debug(__name__, 'Initializing analysis object', root=self.mpi_root, comm=self.mpi_comm)
             self.analysis_object = None if self.analysis_class is None else self.analysis_class()
             self.analysis_object.mpi_root = self.mpi_root
             self.analysis_object.mpi_comm = self.mpi_comm
         else:
             pass
     else:
         self.analysis_object = None
Example No. 20
    def insert(self, index, analysis_object):
        """
        Insert a given analysis object at the given location

        :param index: Location where the object should be inserted
        :param analysis_object: The analysis object to be inserted

        """
        from omsi.analysis.base import analysis_base
        if isinstance(analysis_object, analysis_base):
            if analysis_object in self:
                log_helper.debug(__name__, "Analysis already in the list of tasks")
                return
            log_helper.info(__name__, "Inserting analysis object in the workflow list. " + str(analysis_object))
            super(analysis_task_list, self).insert(index, analysis_object)
        else:
            raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
Example No. 21
    def set_parameter_default_value(self, name, value):
        """
        Set the default value of the parameter with the given name

        :param name: Name of the parameter
        :param value: New value

        :raises: KeyError if parameter not found
        """
        log_helper.debug(
            __name__,
            "Setting default value of " + str(name) + " to " + str(value))
        param = self.get_parameter_data_by_name(dataname=name)
        if isinstance(param, parameter_data):
            param['default'] = value
        else:
            raise KeyError('Unknown parameter ' + str(name))
Example No. 22
    def main(self):
        """Execute the analysis workflow"""
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty")
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow")
        log_helper.info(__name__, "Adding all dependencies")
        self.add_analysis_dependencies()

        # Record the runtime information
        log_helper.debug(__name__, "Recording runtime information")
        self.run_info.clear()
        self.run_info.record_preexecute()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow")
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(
                        analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__,
                                     "Execute analysis: " + str(analysis))
                    analysis.execute()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.")
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(
                    __name__,
                    "Workflow could not be fully executed. " + str(num_tasks) +
                    " remain in the queue but cannot be completed due to unresolved dependencies."
                )
                break  # Avoid looping forever when the remaining tasks can never become ready
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

        # Record the runtime information after we are done with the workflow
        self.run_info.record_postexecute()
        self.run_info.gather()
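The loop above is a greedy scheduler: each pass executes every analysis whose dependencies are already satisfied and stops once nothing is pending or no pending task can become ready. A simplified standalone sketch of that control flow with plain callables (names and structure are illustrative):

    def run_greedy(tasks):
        # tasks maps a task name to (set of dependency names, callable to execute)
        done = set()
        while True:
            pending = [name for name in tasks if name not in done]
            ready = [name for name in pending if tasks[name][0].issubset(done)]
            if not pending:
                print("Completed executing the workflow.")
                break
            if not ready:
                print("%i tasks blocked by unresolved dependencies." % len(pending))
                break
            for name in ready:
                tasks[name][1]()    # execute the analysis
                done.add(name)

    run_greedy({
        'findpeaks': (set(), lambda: print('find peaks')),
        'nmf': ({'findpeaks'}, lambda: print('run NMF')),
    })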
Example No. 23
    def make_analysis_identifiers_unique(self):
        """
        Update analysis identifiers to be unique.

        Side effects: This function updates the analysis tasks stored in the set

        :return: self, i.e., the modified object with identifiers updated
        """
        identifiers = self.get_all_analysis_identifiers()
        unique_identifiers = list(set(identifiers))
        num_update = len(identifiers) - len(unique_identifiers)
        if num_update > 0:
            log_helper.debug(__name__, "%i analyses have non-unique identifiers and will be updated" % num_update)
        ana_index = 0
        for ana in self:
            current_identifier = ana.get_analysis_identifier()
            if current_identifier not in unique_identifiers:
                ana.set_analysis_identifier('ana_' + str(ana_index) + "_" + unicode(current_identifier))
            ana_index += 1
        return self
Example No. 24
    def enable_profile_time_and_usage(self, enable=True):
        """
        Enable/disable time and usage profiling

        :param enable: boolean to enable (True) or disable (False) time and usage profiling

        """
        if PROFILE_AVAILABLE:
            if not enable and self.__profile_time_and_usage:
                log_helper.debug(__name__, "Disabled time and usage profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            if enable and not self.__profile_time_and_usage:
                log_helper.debug(__name__, "Enabled time and usage profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            self.__profile_time_and_usage = enable
        else:
            self.__profile_time_and_usage = False
            if enable:
                log_helper.warning(__name__, 'Profiling of time and usage not available.' +
                                   ' Missing profile and/or pstats package')
Example No. 25
    def enable_profile_memory(self, enable=True):
        """
        Enable/disable profiling of memory usage

        :param enable: boolean to enable (True) or disable (False) memory profiling

        """
        if PROFILE_MEMORY_AVAILABLE:
            if not enable and self.__profile_memory:
                log_helper.debug(__name__, "Disabled memory profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            if enable and not self.__profile_memory:
                log_helper.debug(__name__, "Enabled memory profiling. ",
                                 root=self.mpi_root, comm=self.mpi_comm)
            self.__profile_memory = enable
        else:
            self.__profile_memory = False
            if enable:
                log_helper.warning(__name__, 'Profiling of memory usage not available.' +
                                   ' Missing memory_profiler or StringIO package')
Example No. 26
    def __compute_scan_types_and_indices(self, filename=None):
        """
        Internal helper function used to compute a list of unique scan types in the mzml file.
        Also computes a numpy 1d array of ints that maps every scan to its relevant datacube.
        """
        reader = mzml.read(filename)
        scantypes = []
        scan_indices = []
        scan_profiled = []
        for idx, spectrum in enumerate(reader):
            try:
                scanfilter = spectrum['scanList']['scan'][0]['filter string']
                if scanfilter not in scantypes:
                    scantypes.append(scanfilter)
                    scan_profiled.append(spectrum.has_key('profile spectrum'))
                scan_indices.append(scantypes.index(scanfilter))
            except:
                log_helper.debug(__name__, idx)

        assert len(scan_indices) == self.num_scans
        return scantypes, scan_indices, scan_profiled
Example No. 27
    def add_analysis_from_scripts(self,
                                  script_files):
        """
        Evaluate the list of scripts and add all (i.e., zero, one, or multiple) analyses to this workflow

        NOTE: This function executes scripts using exec(..), i.e., there are NO safeguards against malicious code.

        :param script_files: List of strings with the paths to the script files. If only a single
            script is used, then a single string may be used as well.

        """
        new_analysis_objects = analysis_task_list.from_script_files(script_files)
        if new_analysis_objects is not None and len(new_analysis_objects) > 0:
            log_helper.debug(__name__,
                             "Adding %i new analyses to the workflow from scripts" % len(new_analysis_objects),
                             root=self.mpi_root, comm=self.mpi_comm)
            self.analysis_tasks = self.analysis_tasks.union(new_analysis_objects)
        else:
            log_helper.debug(__name__,
                             "No analysis found in scripts",
                             root=self.mpi_root, comm=self.mpi_comm)
Example No. 28
    def __compute_scan_types_and_indices(self, filename=None):
        """
        Internal helper function used to compute a list of unique scan types in the mzml file.
        Also computes a numpy 1d array of ints that maps every scan to its relevant datacube.
        """
        reader = mzml.read(filename)
        scantypes = []
        scan_indices = []
        scan_profiled = []
        for idx, spectrum in enumerate(reader):
            try:
                scanfilter = spectrum['scanList']['scan'][0]['filter string']
                if scanfilter not in scantypes:
                    scantypes.append(scanfilter)
                    scan_profiled.append(spectrum.has_key('profile spectrum'))
                scan_indices.append(scantypes.index(scanfilter))
            except:
                log_helper.debug(__name__, idx)

        assert len(scan_indices) == self.num_scans
        return scantypes, scan_indices, scan_profiled
Example No. 29
    def clean_up(self):
        """
        Clean up the runinfo object. In particular remove empty keys that
        either recorded None or recorded just an empty string.

        This function may be overwritten to also do clean-up needed
        due to additional custom runtime instrumentation.

        When overwriting this function we should call super(..., self).clean_up()
        at the end of the function to ensure that the runinfo dictionary
        is clean, i.e., does not contain any empty entries.

        """
        log_helper.debug(__name__, 'Clean up runtime data', root=self.mpi_root, comm=self.mpi_comm)
        # Remove empty items from the run_info dict
        for ri_key, ri_value in self.items():
            try:
                if ri_value is None or len(ri_value) == 0:
                    self.pop(ri_key)
            except:
                pass
Example No. 30
    def add_parameter(self,
                      name,
                      help,
                      dtype=unicode,
                      required=False,
                      default=None,
                      choices=None,
                      data=None,
                      group=None):
        """
        Add a new parameter for the analysis. This function is typically used in the constructor
        of a derived analysis to specify the parameters of the analysis.

        :param name: The name of the parameter
        :param help: Help string describing the parameter
        :param dtype: Optional type. Default is string.
        :param required: Boolean indicating whether the parameter is required (True) or optional (False). Default False.
        :param default: Optional default value for the parameter. Default None.
        :param choices: Optional list of choices with allowed data values. Default None, indicating no choices set.
        :param data: The data assigned to the parameter. None by default.
        :param group: Optional group string used to organize parameters. Default None, indicating that
            parameters are automatically organized by driver class (e.g. in required and optional parameters)

        :raises: ValueError is raised if the parameter with the given name already exists.
        """
        log_helper.debug(__name__, "Add parameter " + str(name))
        if self.get_parameter_data_by_name(name) is not None:
            raise ValueError('A parameter with the name ' + unicode(name) +
                             " already exists.")
        self.parameters.append(
            parameter_data(name=name,
                           help=help,
                           dtype=dtype,
                           required=required,
                           default=default,
                           choices=choices,
                           data=data,
                           group=group))
Example No. 31
    def __read_all(self):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry.  Data is now a list of datacubes
        """

        self.data = [np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type) for scan_idx, scantype in enumerate(self.scan_types)]

        for scan_idx, scantype in enumerate(self.scan_types):
            reader = mzml.read(self.basename)
            spectrumid = 0
            if not self.scan_profiled[scan_idx]:
                shift = np.diff(self.mz_all[scan_idx]).mean()
                bin_edges = np.append(self.mz_all[scan_idx], self.mz_all[scan_idx][-1] + shift)
            else:
                bin_edges = None

            for spectrum in reader:
                if spectrum['scanList']['scan'][0]['filter string'] == scantype:
                    x = spectrum['m/z array']
                    try:
                        y = spectrum['intensity array']
                    except KeyError:
                        raise KeyError
                    if bin_edges is None:
                        yi = np.interp(self.mz_all[scan_idx], x, y, 0, 0)  # Re-interpolate the data in profiled mode
                    else:
                        yi, _ = np.histogram(x, bins=bin_edges, weights=y)   # Re-histogram the data in centroided mode
                    xidx = np.nonzero(self.x_pos == self.coordinates[spectrumid, 0])[0]
                    yidx = np.nonzero(self.y_pos == self.coordinates[spectrumid, 1])[0]
                    try:
                        self.data[scan_idx][xidx, yidx, :] = yi
                    except:
                        log_helper.debug(__name__, spectrumid, scan_idx, scantype, self.mz_all[scan_idx].shape)
            # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
                if spectrumid % 1000 == 0:
                    log_helper.info(__name__, 'Processed data for %s spectra to datacube for scan type %s' % (spectrumid, scantype))
                spectrumid += 1
Example No. 32
 def execute_analysis(self):
     """
     Execute the wrapped analysis function, if one has been set.
     """
     if self['__analysis_function'] is not None:
         log_helper.debug(
             __name__,
             "Compiling the input dict for the analysis function.")
         input_dict = {}
         for arg in self.parameters:
             if arg['data'] is not None and arg['name'] not in [
                     '__analysis_function', 'profile_time_and_usage',
                     'profile_memory'
             ]:
                 if isinstance(arg['data'], dependency_dict):
                     input_dict[arg['name']] = arg['data'].get_data()
                 else:
                     input_dict[arg['name']] = arg['data']
         # When we restored the analysis we did not know that the parameter was supposed to be unicode
         log_helper.debug(__name__, "Unpickle the analysis function")
         # Convert to string as we stored the pickle string as uint8 array to avoid problems
         # with HDF5, NULL, and special chars
         analysis_function = self['__analysis_function'].tostring()
         analysis_function = pickle.loads(analysis_function)
         log_helper.debug(__name__, "Executing the analysis function")
         result = analysis_function(**input_dict)
         log_helper.debug(
             __name__, "Creating output data names and returning results")
         if isinstance(result, tuple):
             if len(self.data_names) >= len(result):
                 pass
             else:
                 self.data_names = [
                     (self.DEFAULT_OUTPUT_PREFIX + str(i))
                     for i in range(len(self.data_names), len(result))
                 ]
         elif result is None:
             self.data_names = []
         else:
             if len(self.data_names) >= 1:
                 pass
             else:
                 self.data_names = [self.DEFAULT_OUTPUT_PREFIX + '0']
         return result
     else:
         raise NotImplementedError(
             "We cannot run this analysis. Analysis_generic cannot run " +
             "an analysis unless an analysis function is set.")
Example No. 33
    def __setitem__(self, key, value):
        """
        Set parameter options directly via slicing

        Overwrite this function in child classes to implement custom setting behavior, e.g., error
        checking for valid values before setting a non-standard parameter.

        :param key: name of the parameter
        :param value: new value

        :raise: ValueError if an invalid value is given
        :raise: KeyError if an invalid key is given
        """
        # Check if we have a valid key
        param_set = False
        if isinstance(key, basestring):
            for param in self.parameters:
                if param['name'] == key:
                    log_helper.debug(__name__, "Setting parameter " + key)
                    param['data'] = value
                    param_set = True
        if not param_set:
            raise KeyError('Invalid parameter key')
Example No. 34
    def create_workflow_executor_object(self):
        """
        Initialize the workflow executor object, i.e., set self.workflow_executor

        *Side effects* This function potentially modifies self.workflow_executor

        """
        if self.workflow_executor is None:
            log_helper.debug(__name__,
                             'Initializing workflow executor',
                             root=self.mpi_root,
                             comm=self.mpi_comm)
            default_executor_class = workflow_executor_base.get_default_executor_class()
            if self.script_files is None or len(self.script_files) == 0:
                self.workflow_executor = default_executor_class()
            else:
                self.workflow_executor = default_executor_class.from_script_files(
                    self.script_files)
            self.workflow_executor.mpi_root = self.mpi_root
            self.workflow_executor.mpi_comm = self.mpi_comm
        else:
            pass
Example No. 35
    def __setitem__(self, key, value):
        """
        Set parameter options directly via slicing

        Overwrite this function in child classes to implement custom setting behavior, e.g., error
        checking for valid values before setting a non-standard parameter.

        :param key: name of the parameter
        :param value: new value

        :raise: ValueError if an invalid value is given
        :raise: KeyError if an invalid key is given
        """
        # Check if we have a valid key
        param_set = False
        if isinstance(key, basestring):
            for param in self.parameters:
                if param['name'] == key:
                    log_helper.debug(__name__, "Setting parameter " + key)
                    param['data'] = value
                    param_set = True
        if not param_set:
            raise KeyError('Invalid parameter key')
Example No. 36
    def append(self, analysis_object):
        """
        Add a given analysis to the set of objects to be executed by the workflow

        This is the same as set.add() but we ensure that only analysis_base objects
        are added.

        :param analysis_object: Analysis object to be added to the execution.
            All dependencies of the analysis will also be executed as part of the
            execution.
        :type analysis_object: omsi.analysis.base.analysis_base

        :raises: ValueError is raised if the given analysis_object is invalid
        """
        from omsi.analysis.base import analysis_base
        if isinstance(analysis_object, analysis_base):
            if analysis_object in self:
                log_helper.debug(__name__, "Analysis already in the list of tasks")
                return
            log_helper.debug(__name__, "Adding analysis object to the workflow set. " + str(analysis_object))
            super(analysis_task_list, self).append(analysis_object)
        else:
            raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
Example No. 37
    def main(self):
        """Execute the analysis workflow"""
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty")
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow")
        log_helper.info(__name__, "Adding all dependencies")
        self.add_analysis_dependencies()

        # Record the runtime information
        log_helper.debug(__name__, "Recording runtime information")
        self.run_info.clear()
        self.run_info.record_preexecute()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow")
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                    analysis.execute()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.")
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.")
                break  # Avoid looping forever when the remaining tasks can never become ready
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

        # Record the runtime information after we are done with the workflow
        self.run_info.record_postexecute()
        self.run_info.gather()
Example No. 38
    def clean_up(self):
        """
        Clean up the runinfo object. In particular remove empty keys that
        either recorded None or recorded just an empty string.

        This function may be overwritten to also do clean-up needed
        due to additional custom runtime instrumentation.

        When overwriting this function we should call super(..., self).clean_up()
        at the end of the function to ensure that the runinfo dictionary
        is clean, i.e., does not contain any empty entries.

        """
        log_helper.debug(__name__,
                         'Clean up runtime data',
                         root=self.mpi_root,
                         comm=self.mpi_comm)
        # Remove empty items from the run_info dict
        for ri_key, ri_value in self.items():
            try:
                if ri_value is None or len(ri_value) == 0:
                    self.pop(ri_key)
            except:
                pass
Example No. 39
    def add_parameter(self,
                      name,
                      help,
                      dtype=unicode,
                      required=False,
                      default=None,
                      choices=None,
                      data=None,
                      group=None):
        """
        Add a new parameter for the analysis. This function is typically used in the constructor
        of a derived analysis to specify the parameters of the analysis.

        :param name: The name of the parameter
        :param help: Help string describing the parameter
        :param dtype: Optional type. Default is string.
        :param required: Boolean indicating whether the parameter is required (True) or optional (False). Default False.
        :param default: Optional default value for the parameter. Default None.
        :param choices: Optional list of choices with allowed data values. Default None, indicating no choices set.
        :param data: The data assigned to the parameter. None by default.
        :param group: Optional group string used to organize parameters. Default None, indicating that
            parameters are automatically organized by driver class (e.g. in required and optional parameters)

        :raises: ValueError is raised if the parameter with the given name already exists.
        """
        log_helper.debug(__name__, "Add parameter " + str(name))
        if self.get_parameter_data_by_name(name) is not None:
            raise ValueError('A parameter with the name ' + unicode(name) + " already exists.")
        self.parameters.append(parameter_data(name=name,
                                              help=help,
                                              dtype=dtype,
                                              required=required,
                                              default=default,
                                              choices=choices,
                                              data=data,
                                              group=group))
Example No. 40
 def execute_analysis(self):
     """
     Execute the wrapped analysis function, if one has been set.
     """
     if self['__analysis_function'] is not None:
         log_helper.debug(__name__, "Compiling the input dict for the analysis function.")
         input_dict = {}
         for arg in self.parameters:
             if arg['data'] is not None and arg['name'] not in ['__analysis_function', 'profile_time_and_usage', 'profile_memory']:
                 if isinstance(arg['data'], dependency_dict):
                     input_dict[arg['name']] = arg['data'].get_data()
                 else:
                     input_dict[arg['name']] = arg['data']
         # When we restored the analysis we did not know that the parameter was supposed to be unicode
         log_helper.debug(__name__, "Unpickle the analysis function")
         # Convert to string as we stored the pickle string as uint8 array to avoid problems
         # with HDF5, NULL, and special chars
         analysis_function = self['__analysis_function'].tostring()
         analysis_function = pickle.loads(analysis_function)
         log_helper.debug(__name__, "Executing the analysis function")
         result = analysis_function(**input_dict)
         log_helper.debug(__name__, "Creating output data names and returning results")
         if isinstance(result, tuple):
             if len(self.data_names) >= len(result):
                 pass
             else:
                 self.data_names = [(self.DEFAULT_OUTPUT_PREFIX + str(i))
                                    for i in range(len(self.data_names), len(result))]
         elif result is None:
             self.data_names = []
         else:
             if len(self.data_names) >= 1:
                 pass
             else:
                 self.data_names = [self.DEFAULT_OUTPUT_PREFIX + '0']
         return result
     else:
         raise NotImplementedError("We cannot run this analysis. Analysis_generic cannot run " +
                                   "an analysis unless an analysis function is set.")
Example No. 41
    def main(self):
        """
        Execute the analysis workflow
        """
        # Do the optional MPI barrier
        if self['synchronize']:
            mpi_helper.barrier(comm=self.mpi_comm)

        # Check if we have anything to do at all
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        log_helper.info(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
        self.add_analysis_dependencies()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                     root=self.mpi_root, comm=self.mpi_comm)
                    analysis.execute()
                    if self['reduce_memory_usage']:
                        analysis.clear_and_restore()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.",
                                   root=self.mpi_root, comm=self.mpi_comm)
                break  # Avoid looping forever when the remaining tasks can never become ready
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
Example No. 42
    def __init__(self, basename, requires_slicing=True, resolution=5):
        """
        Open an img file for data reading.

        :param basename: The name of the mzml file. If basename is a directory, then the first mzML file found
                             in the directory will be used instead.
        :type basename: string

        :param requires_slicing: Should the complete data be read into memory
                             (this makes slicing easier). (default is True)
        :type requires_slicing: bool

        :param resolution: For profile data only, the minimum m/z spacing to use for creating the "full" reprofiled
                            data cube
        :type resolution: float
        """
        # Determine the correct base
        if os.path.isdir(basename):
            filelist = self.get_files_from_dir(basename)
            if len(filelist) > 0:
                basename = filelist[0]
            else:
                raise ValueError("No valid mzML file found in the given directory.")
        # self.basename = basename
        # self.requires_slicing = requires_slicing
        
        # Call super constructor. This sets self.basename and self.readall
        super(xmassmzml_file, self).__init__(basename=basename, requires_slicing=requires_slicing)
        self.resolution = resolution
        self.data_type = 'uint32'  # TODO What data type should we use for the interpolated data?
        self.num_scans = self.__compute_num_scans(filename=self.basename)
        log_helper.info(__name__, 'Read %s scans from mzML file.' % self.num_scans)
        log_helper.debug(__name__, 'Compute coordinates')
        self.coordinates = self.__compute_coordinates(filename=self.basename, num_scans=self.num_scans)
        # Compute the spatial configuration of the matrix
        self.x_pos = np.unique(self.coordinates[:, 0])
        self.y_pos = np.unique(self.coordinates[:, 1])
        self.step_size = min([min(np.diff(self.x_pos)), min(np.diff(self.y_pos))])

        # Compute the mz axis
        log_helper.debug(__name__, 'Compute mz axes')
        self.mz = self.__compute_mz_axis(filename=self.basename)
        log_helper.debug(__name__, 'mz axes computed')

        # Determine the shape of the dataset, result is a list of shapes for each datacube
        # self.shape_all_data = [(self.x_pos.shape[0], self.y_pos.shape[0], mz.shape[0]) for mz in self.mz_all

        log_helper.debug(__name__, 'Compute shape')
        self.shape = (self.x_pos.shape[0], self.y_pos.shape[0], len(self.mz))
        # self.shape = None
        # self.mz = None

        # Read the data into memory
        # self.data = None
        log_helper.debug(__name__, 'read all')
        if requires_slicing:
            self.data = self.__read_all()
        log_helper.debug(__name__, 'Finished with init')
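A hedged usage sketch of the reader constructed above; the file path and the number of m/z bins are hypothetical, and only the constructor signature shown in this example is assumed:

# Open an XMASS-style mzML dataset; with requires_slicing=True the full
# (x, y, m/z) cube is loaded into memory so that array-style slicing works.
reader = xmassmzml_file(basename='/data/example_run.mzML',   # hypothetical path
                        requires_slicing=True,
                        resolution=5)
print(reader.shape)               # (num_x, num_y, len(mz))
print(reader.mz[:10])             # first few m/z bin centers
ion_image = reader.data[:, :, 100]   # ion image for one m/z bin (index chosen for illustration)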
Ejemplo n.º 43
0
    def record_postexecute(self, execution_time=None):
        """
        Function used to record runtime information after the task we want to track is completed, e.g.
        the `execute_analysis(...)` function of a standard analysis.

        The function may be overwritten in child classes to add recording of
        additional runtime information.

        When overwriting the function we should call super(..., self).record_postexecute(execution_time)
        in the custom version to ensure that the execution_time and end_time are properly
        recorded.

        :param execution_time: The total time it took to execute the analysis. May be None, in which
            case the function will attempt to compute the execution time based on the start_time
            (if available) and the current time.

        """
        log_helper.debug(__name__,
                         'Recording post-execution runtime data',
                         root=self.mpi_root,
                         comm=self.mpi_comm)
        # Finalize recording of post execution provenance
        self['end_time'] = unicode(datetime.datetime.now())
        if execution_time is not None:
            self['execution_time'] = unicode(execution_time)
        elif 'start_time' in self:
            start_time = run_info_dict.string_to_time(self['start_time'])
            stop_time = run_info_dict.string_to_time(self['end_time'])
            self['execution_time'] = unicode(
                stop_time - start_time
            )  # TODO: This only gives execution time in full seconds right now
        else:
            self['execution_time'] = None
        # Attempt to record psutil data
        try:
            import psutil
            process = psutil.Process()
            self['memory_info_after'] = unicode(process.memory_info())
        except ImportError:
            log_helper.warning(
                __name__,
                'psutil not installed. Recording of part of runtime information not possible',
                root=self.mpi_root,
                comm=self.mpi_comm)
        except:
            warnings.warn(
                "Recording of psutil-based runtime information failed: " +
                str(sys.exc_info()))

        # Record the time and use profiling data if possible
        if self.__time_and_use_profiler is not None:
            self.__time_and_use_profiler.disable()
            self.__time_and_use_profiler.create_stats()
            self['profile'] = unicode(self.__time_and_use_profiler.stats)
            # Save the summary statistics for the profiling data
            stats_io = StringIO.StringIO()
            profiler_stats = pstats.Stats(
                self.__time_and_use_profiler,
                stream=stats_io).sort_stats('cumulative')
            profiler_stats.print_stats()
            self['profile_stats'] = stats_io.getvalue()

        # Record the memory profiling data if possible
        if self.__memory_profiler is not None and self.get_profile_memory():
            log_helper.debug(__name__,
                             'Recording memory profiling data',
                             root=self.mpi_root,
                             comm=self.mpi_comm)
            mem_stats_io = StringIO.StringIO()
            memory_profiler.show_results(self.__memory_profiler,
                                         stream=mem_stats_io)
            self['profile_mem'] = unicode(self.__memory_profiler.code_map)
            self['profile_mem_stats'] = mem_stats_io.getvalue()
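The profiling block above follows the standard cProfile/pstats pattern; a minimal self-contained sketch of the same steps (Python 2 style, to match the StringIO and unicode usage above; the profiled workload is just a stand-in):

import StringIO
import pstats
from cProfile import Profile

profiler = Profile()
profiler.enable()
sum(i * i for i in range(100000))    # stand-in for the tracked analysis
profiler.disable()
profiler.create_stats()

# Render the summary statistics to a string, as stored in self['profile_stats'] above
stats_io = StringIO.StringIO()
pstats.Stats(profiler, stream=stats_io).sort_stats('cumulative').print_stats()
profile_stats = stats_io.getvalue()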
Ejemplo n.º 44
0
    def record_preexecute(self):
        """
        Record basic runtime information in this dict before the execution is started.


        Function used to record runtime information prior to executing the process we want to track, e.g.,
        the `execute_analysis(...)` of a standard analysis.

        The function may be overwritten in child classes to add recording of
        additional runtime information. All runtime data should be recorded in the
        main dict (i.e., self). This ensures in the case of standard analysis that
        the data is stored in the HDF5 file. Other data should be stored in separate
        variables that we may add to the object.

        When overwriting the function we should typically call super(..., self).record_preexecute()
        last in the custom version to ensure that the start_time is properly recorded right before
        the execution of the analysis.

        """
        log_helper.debug(__name__,
                         'Recording pre-execution runtime data',
                         root=self.mpi_root,
                         comm=self.mpi_comm)
        # Record basic runtime environment information using the platform module
        try:
            self['architecture'] = unicode(platform.architecture())
            self['java_ver'] = unicode(platform.java_ver())
            self['libc_ver'] = unicode(platform.libc_ver())
            self['linux_distribution'] = unicode(platform.linux_distribution())
            self['mac_ver'] = unicode(platform.mac_ver())
            self['machine'] = unicode(platform.machine())
            self['node'] = unicode(platform.node())
            self['platform'] = unicode(platform.platform())
            self['processor'] = unicode(platform.processor())
            self['python_branch'] = unicode(platform.python_branch())
            self['python_build'] = unicode(platform.python_build())
            self['python_compiler'] = unicode(platform.python_compiler())
            self['python_implementation'] = unicode(
                platform.python_implementation())
            self['python_revision'] = unicode(platform.python_revision())
            self['python_version'] = unicode(platform.python_version())
            self['release'] = unicode(platform.release())
            self['system'] = unicode(platform.system())
            self['uname'] = unicode(platform.uname())
            self['version'] = unicode(platform.version())
            self['win32_ver'] = unicode(platform.win32_ver())
        except:
            warnings.warn(
                "WARNING: Recording of platform provenance failed: " +
                str(sys.exc_info()))

        # Attempt to record the svn version information
        try:
            import subprocess
            self['svn_ver'] = subprocess.check_output('svnversion').rstrip(
                '\n')
        except ImportError:
            log_helper.warning(
                __name__,
                'Recording of svn version not possible. subprocess not installed',
                root=self.mpi_root,
                comm=self.mpi_comm)
        except:
            warnings.warn("Recording of svn version information failed: " +
                          str(sys.exc_info()))

        # Attempt to record software library version
        try:
            import numpy as np
            self['numpy_version_full_version'] = unicode(
                np.version.full_version)
            self['numpy_version_release'] = unicode(np.version.release)
            self['numpy_version_git_revision'] = unicode(
                np.version.git_revision)
        except ImportError:
            log_helper.warning(__name__,
                               'Recording of numpy version not possible.',
                               root=self.mpi_root,
                               comm=self.mpi_comm)

        # Attempt to record psutil data
        try:
            import psutil
            self['logical_cpu_count'] = unicode(psutil.cpu_count())
            self['cpu_count'] = unicode(psutil.cpu_count(logical=False))
            process = psutil.Process()
            self['open_files'] = unicode(process.open_files())
            self['memory_info_before'] = unicode(process.memory_info())
        except ImportError:
            log_helper.warning(
                __name__,
                'psutil not installed. Recording of part of runtime information not possible',
                root=self.mpi_root,
                comm=self.mpi_comm)
        except:
            warnings.warn(
                "Recording of psutil-based runtime information failed: " +
                str(sys.exc_info()))

        # Record the start time for the analysis
        self['start_time'] = unicode(datetime.datetime.now())

        # Enable time and usage profiling if requested
        if self.__profile_time_and_usage:
            self.__time_and_use_profiler = Profile()
            self.__time_and_use_profiler.enable()
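The platform-provenance recording above amounts to storing the string form of selected platform module calls in the runtime dict; a minimal sketch of that idea, wrapped so that a failing call never breaks the run (the helper name is made up for this sketch):

import platform

def collect_platform_provenance():
    provenance = {}
    for key, func in [('platform', platform.platform),
                      ('machine', platform.machine),
                      ('node', platform.node),
                      ('system', platform.system),
                      ('python_version', platform.python_version)]:
        try:
            provenance[key] = str(func())
        except Exception:
            provenance[key] = None   # never let provenance recording break the analysis
    return provenance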
Ejemplo n.º 45
0
    def from_function(cls, analysis_function, output_names=None, parameter_specs=None, name_key="undefined"):
        """
        Create a generic analysis class for a given analysis function.

        This functionality is useful to ease quick scripting on analyses but should not be used in production.

        NOTE: __analysis_function is a reserved parameter name used to store the analysis function and may
        not be used as an input parameter for the analysis function.

        :param analysis_function: The analysis function to be wrapped for provenance tracking and storage
        :param output_names: Optionally, define a list of the names of the outputs
        :param parameter_specs: Optional list of omsi.datastructures.analysis_data.parameter_data with
            additional information about the parameters of the function.
        :param name_key: The name for the analysis, i.e., the analysis identifier

        :return: A new generic analysis class
        """
        log_helper.debug(__name__, "Creating generic analysis from function")
        ana_dtypes = data_dtypes.get_dtypes()
        generic_analysis = cls(name_key=name_key)
        generic_analysis.real_analysis_type = analysis_function.__code__.co_name
        function_argcount = analysis_function.__code__.co_argcount   # Get the number of function parameters
        function_args = analysis_function.__code__.co_varnames[0:function_argcount] # Get the function arguments
        # Get the default values for the function parameters
        function_defaults = ()
        if hasattr(analysis_function, 'func_defaults'):
            if analysis_function.func_defaults is not None:
                function_defaults = analysis_function.func_defaults
        function_nondefaults = function_argcount - len(function_defaults)
        default_pos = 0
        # Add all parameters of the function to our generic analysis
        for varindex, varname in enumerate(function_args):
            # Determine the default value (if any) for the current parameter
            has_default = varindex >= function_nondefaults
            default = None
            if has_default:
                default = function_defaults[default_pos]
                default_pos += 1
            # Check if the user has supplied an additional specification for the current parameter
            param_spec = None
            if parameter_specs is not None:
                for ps in parameter_specs:
                    if isinstance(ps, dict) or isinstance(ps, parameter_data):
                        if ps['name'] == varname:
                            param_spec = ps
                    else:
                        raise ValueError("Invalid parameter specification. Spec is not a dict or parameter_data object")
            # Try to determine the dtype from the default values of the function
            dtype = None
            if default is not None:
                if isinstance(default, list) or isinstance(default, np.ndarray):
                    dtype = ana_dtypes['ndarray']
                elif isinstance(default, bool):
                    dtype = ana_dtypes['bool']
                elif isinstance(default, basestring):
                    dtype = str
                else:
                    for k, v in ana_dtypes.iteritems():
                        try:
                            if isinstance(default, v):
                                dtype = v
                                break
                        except:
                            pass
            # Add the parameter to our analysis
            if param_spec is None:
                generic_analysis.add_parameter(name=varname,
                                               help=' ',
                                               dtype=dtype,
                                               default=default)
            else:
                generic_analysis.add_parameter(
                        name=varname,
                        help=' ' if 'help' not in param_spec else param_spec['help'],
                        dtype=dtype if 'dtype' not in param_spec else param_spec['dtype'],
                        required=(not has_default) if 'required' not in param_spec else param_spec['required'],
                        default=default if 'default' not in param_spec else param_spec['default'],
                        choices=None if 'choices' not in param_spec else param_spec['choices'],
                        group=None if 'group' not in param_spec else param_spec['group'],
                        data=None if 'data' not in param_spec else param_spec['data'])
        # Add the analysis function as an internal parameter to our analysis
        generic_analysis.add_parameter(name='__analysis_function',
                                       help='The analysis function we want to execute',
                                       dtype=ana_dtypes['ndarray'])
        # Assign the names of the outputs
        if output_names is not None:
            generic_analysis.data_names = output_names
        # Pickle out analysis function and save it
        generic_analysis['__analysis_function'] = np.fromstring(cloudpickle.dumps(analysis_function), cls.PICKLE_DTYPE)
        # Return our initialized analysis
        return generic_analysis
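The parameter discovery in from_function relies only on standard function introspection; a minimal sketch of just that step (Python 2 style, matching func_defaults above; peak_finder and describe_parameters are made up for illustration):

def describe_parameters(func):
    """Return a list of (name, default_or_None) tuples for the positional args of func."""
    argcount = func.__code__.co_argcount
    argnames = func.__code__.co_varnames[0:argcount]
    defaults = func.func_defaults or ()          # __defaults__ in Python 3
    num_nondefault = argcount - len(defaults)
    described = []
    for index, name in enumerate(argnames):
        default = defaults[index - num_nondefault] if index >= num_nondefault else None
        described.append((name, default))
    return described

def peak_finder(msidata, threshold=0.05, smooth=True):
    pass

print(describe_parameters(peak_finder))
# [('msidata', None), ('threshold', 0.05), ('smooth', True)]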
Ejemplo n.º 46
0
    def main(self):
        """
        Default main function for running an analysis from the command line.
        The default implementation exposes all specified analysis parameters as command
        line options to the user. The default implementation also provides means to
        print a help text for the function.

        :raises: ValueError is raised in case that the analysis class is unknown

        """

        # Initialize the argument parser
        if self.parser is None:
            self.initialize_argument_parser()

        try:
            # Parse the command line arguments to determine the command line driver settings
            self.parse_cl_arguments()
        except:
            self.remove_output_target()
            raise

        if self.workflow_executor is None:
            self.remove_output_target()
            log_helper.error(
                __name__,
                'Missing --script parameter or workflow_executor object')
            raise ValueError('Workflow not initialized')

        # Add and parse the command line arguments specific to the analysis to determine the analysis settings
        try:
            self.add_and_parse_workflow_arguments()
        except:
            self.remove_output_target()
            raise

        # Print the analysis settings
        if mpi_helper.get_rank() == self.mpi_root:
            self.print_settings()

        # Enable time and usage profiling
        try:
            # Enable time and usage profiling if requested
            if self.profile_analyses:
                try:
                    self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(
                        self.profile_analyses)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of time and usage not available due to missing packages."
                    )
                    log_helper.warning(__name__, e.message)
            # Enable memory profiling if requested
            if self.profile_analyses_mem:
                try:
                    self.workflow_executor.analysis_tasks.enable_memory_profiling(
                        self.profile_analyses_mem)
                except ImportError as e:
                    log_helper.warning(
                        __name__,
                        "Profiling of memory usage not available due to missing packages"
                    )
                    log_helper.warning(__name__, e.message)
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Execute the analysis
        try:
            log_helper.debug(__name__,
                             'Analysis arguments: ' +
                             str(self.analysis_arguments),
                             root=self.mpi_root,
                             comm=self.mpi_comm)
            self.workflow_executor.execute()
        except:
            if mpi_helper.get_rank() == self.mpi_root:
                self.remove_output_target()
            raise

        # Finalize the saving of results on our mpi root rank. NOTE: When running in serial,
        # the condition mpi_helper.get_rank() == self.mpi_root evaluates to True because
        # our mpi_root is 0 and mpi_helper returns 0 for the rank when running in serial.
        if mpi_helper.get_rank() == self.mpi_root:

            # Print usage profiles if available
            try:
                self.print_time_and_usage_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print time and usage profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print memory profile data if available
            try:
                self.print_memory_profiles()
            except:
                log_helper.error(
                    __name__,
                    "An error occured while trying to print memory profiles",
                    root=self.mpi_root,
                    comm=self.mpi_comm)

            # Print the time it took to run the analysis
            try:
                # Parallel case: We need to compile/collect timing data from all cores
                if isinstance(
                        self.workflow_executor.run_info['execution_time'],
                        list):
                    # Time for each task to execute
                    log_helper.info(
                        __name__,
                        "Time in seconds for each analysis process: " +
                        str(self.workflow_executor.run_info['execution_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Start times of each task
                    log_helper.info(
                        __name__,
                        "Time when each of the processes started: " +
                        str(self.workflow_executor.run_info['start_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)
                    # Stop times for each task

                    log_helper.info(
                        __name__,
                        "Time when each of the processes finished: " +
                        str(self.workflow_executor.run_info['end_time']),
                        root=self.mpi_root,
                        comm=self.mpi_comm)

                    # Compile the time to execute string
                    exec_time_array = np.asarray(
                        self.workflow_executor.run_info['execution_time'],
                        dtype=float)
                    max_exec_time = str(exec_time_array.max())
                    min_exec_time = str(exec_time_array.min())
                    mean_exec_time = str(exec_time_array.mean())
                    exec_time_string = max_exec_time + " s " + \
                        "    ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
                # Serial case: We only have a single time to worry about
                else:
                    exec_time_string = str(self.workflow_executor.
                                           run_info['execution_time']) + " s"
                log_helper.info(__name__,
                                "Time to execute analysis: " +
                                exec_time_string,
                                root=self.mpi_root,
                                comm=self.mpi_comm)
            except:
                raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            for analysis in self.workflow_executor.analysis_tasks:
                omsi_analysis_manager.create_analysis_static(
                    analysis_parent=self.output_target, analysis=analysis)
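The timing summary in the parallel branch above reduces the per-rank execution times to a max/min/mean string; a small sketch of just that reduction, using hypothetical per-rank values in place of run_info['execution_time']:

import numpy as np

execution_time = ['12.1', '11.8', '13.4']   # hypothetical per-rank times in seconds
exec_time_array = np.asarray(execution_time, dtype=float)
exec_time_string = "%s s    ( min = %s , mean = %s )" % (
    exec_time_array.max(), exec_time_array.min(), exec_time_array.mean())
print(exec_time_string)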
Ejemplo n.º 47
0
    omsi_format_common, \
    omsi_format_analysis, \
    omsi_format_dependencies
from omsi.dataformat.omsi_file.dependencies import omsi_dependencies_manager
from omsi.dataformat.omsi_file.common import omsi_file_common, omsi_file_object_manager
from omsi.datastructures.run_info_data import run_info_dict
import omsi.shared.mpi_helper as mpi_helper
from omsi.shared.log import log_helper

#try:
#    import cloudpickle   # Use the version of cloud-pickle installed on the system
#    log_helper.debug(__name__, "Using system cloudpickle module")
#except ImportError:
#    try:
import omsi.shared.third_party.cloudpickle as cloudpickle
log_helper.debug(__name__, "Using fallback cloudpickle version")
#    except ImportError:
#        log_helper.warning(__name__, "cloudpickle could not be imported. Using standard pickle instead. " +
#                           " Some features may not be available.")
#        import pickle as cloudpickle
import pickle

# TODO create_analysis_static(...) and other create functions need to handle the case when a file is opened with the MPI I/O backend. Currently we assume a serial write from root

class omsi_analysis_manager(omsi_file_object_manager):
    """
    Analysis manager helper class used to define common functionality needed for analysis-related data.
    Usually, a class that defines a format that contains an omsi_file_analysis object
    will inherit from this class (in addition to omsi_file_common) to acquire the common
    features.
Ejemplo n.º 48
0
    def __init__(self, basename, requires_slicing=True, resolution=5):
        """
        Open an mzML file for data reading.

        :param basename: The name of the mzml file. If basename is a directory, then the first mzML file found
                             in the directory will be used instead.
        :type basename: string

        :param requires_slicing: Should the complete data be read into memory
                             (this makes slicing easier). (default is True)
        :type requires_slicing: bool

        :param resolution: For profile data only, the minimum m/z spacing to use for creating the "full" reprofiled
                            data cube
        :type resolution: float
        """
        # Determine the correct base
        if os.path.isdir(basename):
            filelist = self.get_files_from_dir(basename)
            if len(filelist) > 0:
                basename = filelist[0]
            else:
                raise ValueError(
                    "No valid mzML file found in the given directory.")
        # self.basename = basename
        # self.requires_slicing = requires_slicing

        # Call super constructor. This sets self.basename and self.readall
        super(xmassmzml_file, self).__init__(basename=basename,
                                             requires_slicing=requires_slicing)
        self.resolution = resolution
        self.data_type = 'uint32'  # TODO What data type should we use for the interpolated data?
        self.num_scans = self.__compute_num_scans(filename=self.basename)
        log_helper.info(__name__,
                        'Read %s scans from mzML file.' % self.num_scans)
        log_helper.debug(__name__, 'Compute coordinates')
        self.coordinates = self.__compute_coordinates(filename=self.basename,
                                                      num_scans=self.num_scans)
        # Compute the spatial configuration of the matrix
        self.x_pos = np.unique(self.coordinates[:, 0])
        self.y_pos = np.unique(self.coordinates[:, 1])
        self.step_size = min(
            [min(np.diff(self.x_pos)),
             min(np.diff(self.y_pos))])

        # Compute the mz axis
        log_helper.debug(__name__, 'Compute mz axes')
        self.mz = self.__compute_mz_axis(filename=self.basename)
        log_helper.debug(__name__, 'mz axes computed')

        # Determine the shape of the dataset, result is a list of shapes for each datacube
        # self.shape_all_data = [(self.x_pos.shape[0], self.y_pos.shape[0], mz.shape[0]) for mz in self.mz_all

        log_helper.debug(__name__, 'Compute shape')
        self.shape = (self.x_pos.shape[0], self.y_pos.shape[0], len(self.mz))
        # self.shape = None
        # self.mz = None

        # Read the data into memory
        # self.data = None
        log_helper.debug(__name__, 'read all')
        if requires_slicing:
            self.data = self.__read_all()
        log_helper.debug(__name__, 'Finished with init')
Ejemplo n.º 49
0
    def __run_dynamic(self):
        """
        Run the task function using dynamic task scheduling.

        The root rank divides the data into sub-tasks and sends the tasks to available MPI
        processes on request.

        :return: Tuple with the following elements:

            1) List with the results from the local execution of the task_function. Each
               entry is the result from one return of the task_function.
            2) List of block_indexes. Each block_index is a tuple with the selection used to
               divide the data into sub-blocks. In the case of static decomposition we have
               a range slice object along the axes used for decomposition.

        """
        try:
            from omsi.shared.log import log_helper
        except ImportError:
            from pactolus.third_party.log import log_helper
        import time
        rank = get_rank(comm=self.comm)
        size = get_size(comm=self.comm)

        if size < 2:
            warnings.warn('DYNAMIC task scheduling requires at least 2 MPI ranks. Using STATIC scheduling instead.')
            return self.__run_static_1D()

        # We are the controlling rank
        if rank == self.root:
            self.result = []
            self.blocks = []
            self.block_times = []
            # Get data shape parameters and compute the data blocks
            axes_shapes = np.asarray(self.main_data.shape)[self.split_axes]
            total_num_subblocks = np.prod(axes_shapes)
            if total_num_subblocks < size:
                if rank == self.root:
                    warnings.warn("Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")

            # Compute the list of all possible blocks
            base_blocks = [[slice(None)]] * len(self.main_data.shape)
            for axis_index in self.split_axes:
                base_blocks[axis_index] = range(self.main_data.shape[axis_index])
            block_tuples = itertools.product(*base_blocks)

            # Communicate blocks with task ranks
            log_helper.info(__name__, "PROCESSING DATA BLOCKS")
            start_time = time.time()
            block_index = 0
            for block_selection in block_tuples:
                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((block_index, block_selection),
                               dest=request_rank,
                               tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                block_index += 1
                if (block_index % 100) == 0:
                    log_helper.debug(__name__, str((block_index, total_num_subblocks, request_rank)))
            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR SCHEDULING ALL TASKS: " + str(run_time))
            start_time = time.time()
            log_helper.info(__name__, "FINALIZING")
            # Terminate all ranks and receive all data from the different ranks if requested
            all_ranks_status = np.zeros(size, 'bool')
            all_ranks_status[self.root] = True
            while not np.all(all_ranks_status):

                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((None, None), dest=request_rank, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                all_ranks_status[request_rank] = True

            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR FINALIZING TASKS: " + str(run_time))

        # We are a rank that has to run tasks
        else:
            # Request a new data block
            self.result = []
            self.blocks = []
            self.block_times = []
            while True:
                start_time = time.time()
                self.comm.send(rank, dest=self.root, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                block_index, block_selection = self.comm.recv(source=self.root, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                if block_index is None:
                    break
                # Execute the task_function on the given data block
                task_params = self.task_function_params
                task_params[self.main_data_param_name] = self.main_data[block_selection]
                self.result.append(self.task_function(**task_params))
                self.blocks.append(block_selection)
                # Record the timings
                end_time = time.time()
                run_time = end_time - start_time
                self.block_times.append(run_time)

        # Return the result
        return self.result, self.blocks
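A stripped-down sketch of the same request/dispatch pattern with mpi4py, independent of the class above; the tags, the block list, and the squaring stand-in for task_function are assumptions made only for this sketch (run with something like mpirun -n 4):

from mpi4py import MPI

RANK_TAG, BLOCK_TAG = 11, 12
comm = MPI.COMM_WORLD
rank, size, root = comm.Get_rank(), comm.Get_size(), 0

if rank == root:
    # Root hands out block indexes to whichever worker asks next
    blocks = list(range(20))
    for block in blocks:
        worker = comm.recv(source=MPI.ANY_SOURCE, tag=RANK_TAG)
        comm.send(block, dest=worker, tag=BLOCK_TAG)
    # Send the termination sentinel to every worker
    for _ in range(size - 1):
        worker = comm.recv(source=MPI.ANY_SOURCE, tag=RANK_TAG)
        comm.send(None, dest=worker, tag=BLOCK_TAG)
else:
    results = []
    while True:
        comm.send(rank, dest=root, tag=RANK_TAG)      # request a new data block
        block = comm.recv(source=root, tag=BLOCK_TAG)
        if block is None:                             # no work left, terminate
            break
        results.append(block * block)                 # stand-in for task_function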
Ejemplo n.º 50
0
Generic analysis class used to represent analyses of unknown type, e.g., when loading
a custom user-defined analysis from file for which the indicated class may not be
available with the local installation. In this case we want to at least be able
to load and investigate the data.
"""
import pickle

from omsi.analysis.base import analysis_base
from omsi.datastructures.analysis_data import data_dtypes
from omsi.datastructures.dependency_data import dependency_dict
from omsi.datastructures.analysis_data import parameter_data
from omsi.shared.log import log_helper

try:
    import cloudpickle   # Use the version of cloud-pickle installed on the system
    log_helper.debug(__name__, "Using system cloudpickle module")
except ImportError:
    try:
        import omsi.shared.third_party.cloudpickle as cloudpickle
        log_helper.debug(__name__, "Using fallback cloudpickle version")
    except ImportError:
        log_helper.warning(__name__, "cloudpickle could not be imported. Using standard pickle instead. " +
                           " Some features may not be available.")
        import pickle as cloudpickle
import numpy as np


def bastet_analysis(output_names=None, parameter_specs=None, name_key="undefined"):
    """
    Decorator used to wrap a function and replace it with an analysis_generic object
    that behaves like a function but adds the ability for saving the
Ejemplo n.º 51
0
    def clear_parameter_data(self):
        """Clear the list of parameter data"""
        log_helper.debug(__name__, "Clearing parameter data")
        for param in self.parameters:
            param.clear_data()
Ejemplo n.º 52
0
    def __run_dynamic(self):
        """
        Run the task function using dynamic task scheduling.

        The root rank divides the data into sub-tasks and sends the tasks to available MPI
        processes on request.

        :return: Tuple with the following elements:

            1) List with the results from the local execution of the task_function. Each
               entry is the result from one return of the task_function.
            2) List of block_indexes. Each block_index is a tuple with the selection used to
               divide the data into sub-blocks. In the case of static decomposition we have
               a range slice object along the axes used for decomposition.

        """
        from omsi.shared.log import log_helper
        import time
        rank = get_rank(comm=self.comm)
        size = get_size(comm=self.comm)

        if size < 2:
            warnings.warn('DYNAMIC task scheduling requires at least 2 MPI ranks. Using STATIC scheduling instead.')
            return self.__run_static_1D()

        # We are the controlling rank
        if rank == self.root:
            self.result = []
            self.blocks = []
            self.block_times = []
            # Get data shape parameters and compute the data blocks
            axes_shapes = np.asarray(self.main_data.shape)[self.split_axes]
            total_num_subblocks = np.prod(axes_shapes)
            if total_num_subblocks < size:
                if rank == self.root:
                    warnings.warn("Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle")

            # Compute the list of all possible blocks
            base_blocks = [[slice(None)]] * len(self.main_data.shape)
            for axis_index in self.split_axes:
                base_blocks[axis_index] = range(self.main_data.shape[axis_index])
            block_tuples = itertools.product(*base_blocks)

            # Communicate blocks with task ranks
            log_helper.info(__name__, "PROCESSING DATA BLOCKS")
            start_time = time.time()
            block_index = 0
            for block_selection in block_tuples:
                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((block_index, block_selection),
                               dest=request_rank,
                               tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                block_index += 1
                if (block_index % 100) == 0:
                    log_helper.debug(__name__, str((block_index, total_num_subblocks, request_rank)))
            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR SCHEDULING ALL TASKS: " + str(run_time))
            start_time = time.time()
            log_helper.info(__name__, "FINALIZING")
            # Terminate all ranks and receive all data from the different ranks if requested
            all_ranks_status = np.zeros(size, 'bool')
            all_ranks_status[self.root] = True
            while not np.all(all_ranks_status):

                request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                self.comm.send((None, None), dest=request_rank, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                all_ranks_status[request_rank] = True

            end_time = time.time()
            run_time = end_time - start_time
            log_helper.info(__name__, "TIME FOR FINALIZING TASKS: " + str(run_time))

        # We are a rank that has to run tasks
        else:
            # Request a new data block
            self.result = []
            self.blocks = []
            self.block_times = []
            while True:
                start_time = time.time()
                self.comm.send(rank, dest=self.root, tag=self.MPI_MESSAGE_TAGS['RANK_MSG'])
                block_index, block_selection = self.comm.recv(source=self.root, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG'])
                if block_index is None:
                    break
                # Execute the task_function on the given data block
                task_params = self.task_function_params
                task_params[self.main_data_param_name] = self.main_data[block_selection]
                self.result.append(self.task_function(**task_params))
                self.blocks.append(block_selection)
                # Record the timings
                end_time = time.time()
                run_time = end_time - start_time
                self.block_times.append(run_time)

        # Return the result
        return self.result, self.blocks
Ejemplo n.º 53
0
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List with a list of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays  with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__,
                         'Reading inputs',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__,
                             'Preparing file lookup table',
                             comm=self.mpi_comm,
                             root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                        tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                    path=self['trees'])

        # Define the common pactolus parameters
        pactolus_parameters = {
            'file_lookup_table': file_lookup_table,
            'ms1_mass_tol': ms1_mass_tol,
            'ms2_mass_tol': ms2_mass_tol,
            'neutralizations': neutralizations,
            'max_depth': max_depth
        }

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to processes
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__,
                                 'Preparing parallel execution',
                                 comm=self.mpi_comm,
                                 root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,             # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                      # Process the spectra independently
                    split_axes=split_axis,                           # Split along axes
                    main_data_param_name='spectrum_indexes',         # data input param
                    root=self.mpi_root,                              # The root MPI task
                    schedule=self['schedule'],                       # Parallel scheduling scheme
                    comm=self.mpi_comm)                              # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0, ), dtype='f4')
                id_data = np.zeros((0, ), dtype='a100')
                name = np.zeros((0, ), dtype='a100')
                mass = np.zeros((0, ), dtype='f4')
                n_peaks = np.zeros((0, ), dtype='i4')
                n_match = np.zeros((0, ), dtype='i4')

                use_dynamic_schedule = (
                    self['schedule'] ==
                    mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank()
                                            == self.mpi_root
                                            and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    temp_data = [ri[1] for ri in result[0]]
                    # Compile scores
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__,
                         'Processing spectra',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixel we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                       if spectrum_index < (num_spectra - 1)
                       else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__,
                                time_str,
                                comm=self.mpi_comm,
                                root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2),
                                          dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(
                precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(
                scan_list=[current_peaks_list, ],
                ms1_mz=[current_parent_mass, ],
                params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print time_str
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(
                        score_frag_dag.make_pactolus_hit_table(
                            pactolus_results=hit_matrix[current_index],
                            table_file=file_lookup_table,
                            original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index,
                                                               0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(
                            current_hit_table['n_peaks'][score_index])
                        n_match.append(
                            current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(
            mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
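The start/stop arithmetic in the serial loop above assumes a flat peak-storage layout: peak_mz and peak_value hold the peaks of all spectra concatenated, and peak_arrayindex stores one [x, y, offset] row per spectrum. A small self-contained sketch of slicing one spectrum out of that layout (the array contents are made up for illustration):

import numpy as np

# Three spectra with 2, 3 and 1 peaks respectively, stored back to back
peak_mz = np.array([100.1, 120.5, 99.8, 130.2, 150.0, 101.3])
peak_value = np.array([10.0, 5.0, 7.0, 3.0, 9.0, 4.0])
peak_arrayindex = np.array([[0, 0, 0],    # pixel (0,0) starts at offset 0
                            [0, 1, 2],    # pixel (0,1) starts at offset 2
                            [1, 0, 5]])   # pixel (1,0) starts at offset 5

def get_spectrum(i):
    num_spectra = peak_arrayindex.shape[0]
    start = int(peak_arrayindex[i, 2])
    stop = int(peak_arrayindex[i + 1, 2]) if i < (num_spectra - 1) else peak_mz.size
    return peak_mz[start:stop], peak_value[start:stop]

mz, intensity = get_spectrum(1)   # the three peaks belonging to pixel (0, 1)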