def log(cls, module_name, message, root=0, comm=None, level=None, *args, **kwargs):
    """
    Emit a log message at a level selected through a parameter.

    Convenience function for callers that want to choose the log level via an
    input parameter rather than by calling the level-specific function.

    :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
    :param message: The message to be added to the log.
    :param root: The root process to be used for output when running in parallel. If None, then all
                 calling ranks will perform logging. Default is 0.
    :param comm: The MPI communicator used to determine the MPI rank. None by default, in which case
                 mpi.comm_world is used.
    :param level: The level the message is sent to. A key of ``log_helper.log_levels`` is translated
                  to the value stored under that key; any other value is forwarded unchanged.
                  None selects ``log_helper.log_levels['INFO']``.
    :param args: Additional positional arguments forwarded to the logger's log function.
    :param kwargs: Additional keyword arguments forwarded to the logger's log function.
    """
    level_table = log_helper.log_levels
    # Resolve the level first: default to INFO, then translate level names
    if level is None:
        level = level_table['INFO']
    if level in level_table.keys():
        level = level_table[level]
    # Only the selected root rank (or every rank if root is None) writes the message
    emit_here = root is None or root == mpi_helper.get_rank(comm=comm)
    if not emit_here:
        return
    target_logger = cls.get_logger(module_name)
    target_logger.log(level, message, *args, **kwargs)
def main():
    """
    Main function: parse the command line and generate the fragmentation tree library.
    """
    # Add the directory of this script to the module search path
    script_directory = os.path.dirname(__file__)
    sys.path.append(script_directory)

    # Parse the command line arguments and print them once (rank 0 only)
    cl_params = command_line_params()
    on_rank_zero = mpi_helper.get_rank() == 0
    if on_rank_zero:
        print_arguments(cl_params)

    # Create the file parameter for the FragTreeLibrary
    file_params = dict(
        input_inchi_file=cl_params['inchi_file'],
        output_directory=cl_params['output_dir'],
        output_hdf5_file_base=cl_params['output_base_name'],
        output_error_log=cl_params['error_log'],
    )
    # NOTE: 'isostope_file' is the keyword name used by the original call and is kept as-is
    isotope_dict = get_isotope_dict(isostope_file=cl_params['isotope_file'])

    # Generate the fragmentation tree
    FragTreeLibrary(
        max_depth=cl_params['max_depth'],
        isotope_dict=isotope_dict,
        file_params=file_params,
    )
def parse_cl_arguments(self):
    """
    Parse the arguments that belong to the command-line driver itself.

    The function assumes that the command line parser has been set up using
    initialize_argument_parser(..). Analysis arguments are added and parsed later by
    the add_and_parse_analysis_arguments(...) function. The reason for this is two-fold:
    i) to separate the parsing of analysis arguments and arguments of the command-line
    driver and ii) if the same HDF5 file is used as input and output target, then we
    need to open it first here in append mode before it gets opened in read mode later
    by the arguments.

    *Side effects:* The function sets ``self.output_target`` and may change the log level
    via ``log_helper.set_log_level``.

    :raises ValueError: If the save parameter cannot be parsed into a filename or if it
        points to an existing path that is not a file.
    """
    # Parse the known arguments and convert the resulting namespace to a dict
    cl_options = vars(self.parser.parse_known_args()[0])

    # Remove the driver-only analysis class argument before the remaining
    # arguments are handed to the analysis class
    if self.analysis_class_arg_name in cl_options:
        cl_options.pop(self.analysis_class_arg_name)

    # Process the --save argument to determine where we should save the output
    resolve_target_here = (self.output_save_arg_name in cl_options and
                           mpi_helper.get_rank() == self.mpi_root)
    if not resolve_target_here:
        # All other ranks only record the raw value of the save argument
        self.output_target = cl_options.pop(self.output_save_arg_name)
    else:
        self.output_target = cl_options.pop(self.output_save_arg_name)
        if self.output_target is not None:
            # Determine the filename and the object path from the save string
            target_file, target_object_path = omsi_file_common.parse_path_string(self.output_target)
            if target_file is None:
                raise ValueError("ERROR: Invalid save parameter specification " + self.output_target)
            if os.path.exists(target_file) and not os.path.isfile(target_file):
                raise ValueError("ERROR: Save parameter not specify a file.")
            if os.path.exists(target_file):
                # Existing file: use the requested object or fall back to an experiment
                out_file = omsi_file(target_file, mode='r+')
                if target_object_path is not None:
                    self.output_target = omsi_file_common.get_omsi_object(out_file[target_object_path])
                elif out_file.get_num_experiments() > 0:
                    self.output_target = out_file.get_experiment(0)
                else:
                    self.output_target = out_file.create_experiment()
            else:
                # New file: create it together with a first experiment and remember
                # the name of the file that was created here
                out_file = omsi_file(target_file, mode='a')
                self.output_target = out_file.create_experiment()
                self.__output_target_self = target_file

    # The --loglevel argument
    if self.log_level_arg_name in cl_options:
        requested_level = cl_options.pop(self.log_level_arg_name)
        if requested_level in log_helper.log_levels.keys():
            log_helper.set_log_level(level=log_helper.log_levels[requested_level])
        else:
            log_helper.error(module_name=__name__, message="Invalid log level specified")
def print_arguments(cl_params):
    """
    Print the settings from the dict with command line arguments.

    All output is produced on MPI rank 0 only, so that other ranks do not print
    a "Settings:" header without any settings below it.

    :param cl_params: Dict with the command line arguments
    """
    if mpi_helper.get_rank() != 0:
        return
    # Single-argument print(...) calls behave the same as a print statement
    # and as the print function.
    print("")
    print("Settings:")
    print("---------")
    for param_key, param_val in cl_params.items():
        print(str(param_key) + " = " + str(param_val))
    print("")
def exception(cls, module_name, message, root=0, comm=None, *args, **kwargs):
    """
    Create an exception log entry. This function is typically called as:

    log_helper.exception(module_name=__name__, message="your message")

    :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
    :param message: The message to be added to the log.
    :param root: The root process to be used for output when running in parallel. If None, then all
                 calling ranks will perform logging. Default is 0.
    :param comm: The MPI communicator used to determine the MPI rank. None by default, in which case
                 mpi.comm_world is used.
    :param args: Additional positional arguments forwarded to the logger's exception function.
    :param kwargs: Additional keyword arguments forwarded to the logger's exception function.
    """
    # The rank is only looked up when a specific root rank was requested
    emit_here = root is None or root == mpi_helper.get_rank(comm=comm)
    if not emit_here:
        return
    target_logger = cls.get_logger(module_name)
    target_logger.exception(message, *args, **kwargs)
def error(cls, module_name, message, root=0, comm=None, *args, **kwargs):
    """
    Create an error log entry. This function is typically called as:

    log_helper.error(module_name=__name__, message="your message")

    :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
    :param message: The message to be added to the log.
    :param root: The root process to be used for output when running in parallel. If None, then all
                 calling ranks will perform logging. Default is 0.
    :param comm: The MPI communicator used to determine the MPI rank. None by default, in which case
                 mpi.comm_world is used.
    :param args: Additional positional arguments forwarded to the logger's error function.
    :param kwargs: Additional keyword arguments forwarded to the logger's error function.
    """
    # The rank is only looked up when a specific root rank was requested
    emit_here = root is None or root == mpi_helper.get_rank(comm=comm)
    if not emit_here:
        return
    target_logger = cls.get_logger(module_name)
    target_logger.error(message, *args, **kwargs)
def create_analysis_static(analysis_parent,
                           analysis,
                           flush_io=True,
                           force_save=False,
                           save_unsaved_dependencies=True,
                           mpi_root=0,
                           mpi_comm=None):
    """
    Same as create_analysis(...) but instead of relying on object-level state, this
    function takes the required inputs (specifically the analysis_parent) as parameters
    rather than determining them based on self.

    :param analysis_parent: The h5py.Group object or omsi.dataformat.omsi_file.common.omsi_file_common
        object where the analysis should be created.
    :param analysis: The analysis to be saved. On the root rank it is forwarded to
        ``omsi_file_analysis.__create__``; on all other ranks only its
        ``write_analysis_data()`` function is called.
    :param flush_io: Forwarded to ``omsi_file_analysis.__create__``.
    :param force_save: Forwarded to ``omsi_file_analysis.__create__``.
    :param save_unsaved_dependencies: Forwarded to ``omsi_file_analysis.__create__``.
    :param mpi_root: The rank on which the analysis is created. Default is 0.
    :param mpi_comm: The MPI communicator used to determine the rank of the caller.

    :return: The output of ``omsi_file_analysis.__create__`` on the root rank and None
        on all other ranks.

    :raises ValueError: On the root rank if analysis_parent is neither an h5py.Group nor
        an omsi_file_common object.
    """
    on_root_rank = mpi_helper.get_rank(comm=mpi_comm) == mpi_root
    if not on_root_rank:
        # Non-root ranks only get the chance to write their own data. An analysis
        # that does not implement write_analysis_data raises NotImplementedError,
        # which is ignored here.
        try:
            analysis.write_analysis_data()
        except NotImplementedError:
            pass
        return None

    # Determine the h5py group in which the analysis is created
    if isinstance(analysis_parent, h5py.Group):
        parent_group = analysis_parent
    elif isinstance(analysis_parent, omsi_file_common):
        parent_group = analysis_parent.managed_group
    else:
        log_helper.error(__name__,
                         'Illegal analysis_parent type. Expected h5py.Group or omsi_file_common')
        raise ValueError("Illegal value for analysis parent")

    creation_settings = dict(
        parent_group=parent_group,
        analysis=analysis,
        analysis_index=None,  # Same as self.get_num_analysis()
        flush_io=flush_io,
        force_save=force_save,
        save_unsaved_dependencies=save_unsaved_dependencies,
    )
    return omsi_file_analysis.__create__(**creation_settings)
def log(cls, module_name, message, root=0, comm=None, level=None, *args, **kwargs):
    """
    Emit a log message at a level selected through a parameter.

    Convenience function for callers that want to choose the log level via an
    input parameter rather than by calling the level-specific function.

    :param module_name: __name__ of the calling module or None in case the ROOT logger should be used.
    :param message: The message to be added to the log.
    :param root: The root process to be used for output when running in parallel. If None, then all
                 calling ranks will perform logging. Default is 0.
    :param comm: The MPI communicator used to determine the MPI rank. None by default, in which case
                 mpi.comm_world is used.
    :param level: The level the message is sent to. A key of ``log_helper.log_levels`` is translated
                  to the value stored under that key; any other value is forwarded unchanged.
                  None selects ``log_helper.log_levels['INFO']``.
    :param args: Additional positional arguments forwarded to the logger's log function.
    :param kwargs: Additional keyword arguments forwarded to the logger's log function.
    """
    level_table = log_helper.log_levels
    # Resolve the level first: default to INFO, then translate level names
    if level is None:
        level = level_table['INFO']
    if level in level_table.keys():
        level = level_table[level]
    # Only the selected root rank (or every rank if root is None) writes the message
    emit_here = root is None or root == mpi_helper.get_rank(comm=comm)
    if not emit_here:
        return
    target_logger = cls.get_logger(module_name)
    target_logger.log(level, message, *args, **kwargs)
def create_analysis_static(analysis_parent,
                           analysis,
                           flush_io=True,
                           force_save=False,
                           save_unsaved_dependencies=True,
                           mpi_root=0,
                           mpi_comm=None):
    """
    Same as create_analysis(...) but instead of relying on object-level state, this
    function takes the required inputs (specifically the analysis_parent) as parameters
    rather than determining them based on self.

    :param analysis_parent: The h5py.Group object or omsi.dataformat.omsi_file.common.omsi_file_common
        object where the analysis should be created.
    :param analysis: The analysis to be saved. On the root rank it is forwarded to
        ``omsi_file_analysis.__create__``; on all other ranks only its
        ``write_analysis_data()`` function is called.
    :param flush_io: Forwarded to ``omsi_file_analysis.__create__``.
    :param force_save: Forwarded to ``omsi_file_analysis.__create__``.
    :param save_unsaved_dependencies: Forwarded to ``omsi_file_analysis.__create__``.
    :param mpi_root: The rank on which the analysis is created. Default is 0.
    :param mpi_comm: The MPI communicator used to determine the rank of the caller.

    :return: The output of ``omsi_file_analysis.__create__`` on the root rank and None
        on all other ranks.

    :raises ValueError: On the root rank if analysis_parent is neither an h5py.Group nor
        an omsi_file_common object.
    """
    on_root_rank = mpi_helper.get_rank(comm=mpi_comm) == mpi_root
    if not on_root_rank:
        # Non-root ranks only get the chance to write their own data. An analysis
        # that does not implement write_analysis_data raises NotImplementedError,
        # which is ignored here.
        try:
            analysis.write_analysis_data()
        except NotImplementedError:
            pass
        return None

    # Determine the h5py group in which the analysis is created
    if isinstance(analysis_parent, h5py.Group):
        parent_group = analysis_parent
    elif isinstance(analysis_parent, omsi_file_common):
        parent_group = analysis_parent.managed_group
    else:
        log_helper.error(__name__,
                         'Illegal analysis_parent type. Expected h5py.Group or omsi_file_common')
        raise ValueError("Illegal value for analysis parent")

    creation_settings = dict(
        parent_group=parent_group,
        analysis=analysis,
        analysis_index=None,  # Same as self.get_num_analysis()
        flush_io=flush_io,
        force_save=force_save,
        save_unsaved_dependencies=save_unsaved_dependencies,
    )
    return omsi_file_analysis.__create__(**creation_settings)
def write_analysis_data(self, analysis_group=None):
    """
    This function is used to write the actual analysis data to file. If not implemented,
    then the omsi_file_analysis API's default behavior is used instead.

    This implementation never writes data itself. It only checks for an unsupported
    configuration and then raises NotImplementedError.

    :param analysis_group: The h5py.Group object where the analysis is stored. Not used
        by this implementation.

    :raises ValueError: On the root rank when running on more than one MPI rank with the
        'collect' parameter disabled and any of the gathered sizes after the first entry
        is larger than 0.
    :raises NotImplementedError: In all other cases.
    """
    # Check if a user attempts to do parallel I/O with collect being disabled
    # NOTE(review): get_size() is still queried for the default communicator - confirm
    # whether it should also use self.mpi_comm.
    if mpi_helper.get_size() > 1 and not self['collect']:
        # Check if any of the other ranks have data
        peak_arrayindex_shape = self['peak_arrayindex'].shape
        num_elements = peak_arrayindex_shape[0] if len(peak_arrayindex_shape) == 2 else 0
        result_sizes = mpi_helper.gather(num_elements, comm=self.mpi_comm, root=self.mpi_root)
        # The rank must be determined on the same communicator that was used for the
        # gather; otherwise the process inspecting result_sizes may not be the gather root.
        if mpi_helper.get_rank(comm=self.mpi_comm) == self.mpi_root:
            if any(element_size > 0 for element_size in result_sizes[1:]):
                raise ValueError('Parallel I/O with collect parameter set to false not supported')
    raise NotImplementedError
def execute_analysis(self, spectrum_indexes=None, compound_list=None):
    """
    Score the spectra stored in self['fpl_data'] against the compounds of the
    metabolite database using MIDAS.

    If no spectrum_indexes are given, more than one MPI rank is available, and there is
    more than one spectrum, then the spectra are distributed via
    mpi_helper.parallel_over_axes, which uses this function as its task function.
    In all other cases the requested spectra are scored one after another.

    :param spectrum_indexes: List with a list of integer indices of the subset of spectra
        that should be processed by this MPI task. If spectrum_indexes is set, then the given
        subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
        (if available). This parameter is strictly optional and intended for internal use only
        to facilitate the efficient parallel implementation.
    :param compound_list: List of the compounds from the database file. This parameter is used
        to avoid having to read the compound database on every compute task that calls this
        function when running in parallel. This parameter is strictly optional and intended
        for internal use only to facilitate the efficient parallel implementation.

    :returns: A tuple with an array of hit_tables with the scores for each pixel and a 2D array
        of pixel indices describing for each spectrum the (x,y) pixel location in the image.
        The hit_table is an array of (#spectra x #compounds). The hit_table is a structured
        numpy array with the following columns:

        * 'score',  float,  MIDAS score of row
        * 'id',     str,    database ID e.g. 'MetaCyC_7884'
        * 'name',   str,    database name, e.g. 'glycine'
        * 'mass',   float,  mass in Da of IDed compound
        * 'n_peaks', int,   number of peaks in data
        * 'n_match', int,   number of peaks in data matched
    """
    # Assign parameter settings to local variables for convenience
    metabolite_database = self['metabolite_database']
    precursor_type = self['precursor_type']
    parent_mass_windows = self['parent_mass_windows']
    positive_ion_fragment_mass_windows = self['positive_ion_fragment_mass_windows']
    negative_ion_fragment_mass_windows = self['negative_ion_fragment_mass_windows']
    mass_tolerance_parent_ion = self['mass_tolerance_parent_ion']
    mass_tolerance_fragment_ions = self['mass_tolerance_fragment_ions']
    break_rings = self['break_rings']
    fragmentation_depth = self['fragmentation_depth']

    # Calculate the parent_mass
    precursor_mz = self['precursor_mz']  # FIXME Get the precursor_mz from the MS2 data
    # A value of -1 means: read the precursor m/z from the input data instead
    if precursor_mz == -1:
        precursor_mz = self['fpl_data']['precursor_mz'][:]
    default_charge = self['default_charge']  # FIXME Is this an input or should we get this from file
    proton_mass = 1.00782503207 - 5.4857990946e-4
    parent_mass = precursor_mz - (default_charge * proton_mass)

    # Get the data we need to process
    fpl_data = self['fpl_data']
    fpl_peak_mz = fpl_data['peak_mz']
    fpl_peak_value = fpl_data['peak_value']
    fpl_peak_arrayindex = fpl_data['peak_arrayindex']

    # Get the compound list if we have not read it previously.
    if compound_list is None:
        # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
        compound_list = MIDAS.ReadCompoundFile(metabolite_database)

    # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
    # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
    # where we can find the spectrum that we need to process
    num_spectra = fpl_peak_arrayindex.shape[0]
    if spectrum_indexes is None:
        # Get the complete peak array index data
        spectrum_indexes = np.arange(0, num_spectra)
        # Nobody asked for a specific subblock, so this call may start the parallel run
        enable_parallel = True
    else:
        # A single integer index is wrapped into a one-element array
        if isinstance(spectrum_indexes, int):
            spectrum_indexes = np.asarray([spectrum_indexes, ])
        # Explicit indices: process the given subblock in serial
        enable_parallel = False

    #############################################################
    #   Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if enable_parallel:
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = [0, ]
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,                    # Execute this function
                task_function_params={'compound_list': compound_list},  # Reuse the compound_list
                main_data=spectrum_indexes,                             # Process the spectra independently
                split_axes=split_axis,                                  # Split along axes
                main_data_param_name='spectrum_indexes',                # data input param
                root=self.mpi_root,                                     # The root MPI task
                schedule=self['schedule'],                              # Parallel scheduling scheme
                comm=self.mpi_comm)                                     # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # Compile the data from the parallel execution
            hit_table = np.zeros((0, 0), dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)  # initialize hit_table as empty
            pixel_index = np.zeros((0, 2), dtype='int')
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # TODO NEED to update since collect now returns a single list not a list of lists
            if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                # We did not process any data on the root process when using dynamic scheduling
                # and we did not collect the data to the root either
                pass
            # Previous handling of collected results (kept for reference, see TODO above):
            #elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
            #    temp_data = [ri[0] for rt in result[0] for ri in rt]
            #    if len(temp_data) > 0:
            #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
            #    temp_data = [ri[1] for rt in result[0] for ri in rt]
            #    if len(temp_data) > 0:
            #        pixel_index = np.concatenate(tuple(temp_data), axis=0)  # axis=-1
            else:
                # Each entry of result[0] is indexed as (hit_table, pixel_index) of one task
                temp_data = [ri[0] for ri in result[0]]
                if len(temp_data) > 0:
                    hit_table = np.concatenate(tuple(temp_data), axis=-1)
                temp_data = [ri[1] for ri in result[0]]
                if len(temp_data) > 0:
                    pixel_index = np.concatenate(tuple(temp_data), axis=0)
            return hit_table, pixel_index

    #############################################################
    #   Serial processing of the current data block
    #############################################################
    # Initialize the output data structures
    pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
    # Make sure pixel_index is 2D even if only a single row was selected
    if len(pixel_index.shape) == 1:
        pixel_index = pixel_index[np.newaxis, :]
    hit_table = None  # FIXME The initialization of the hit_table is only valid if we assume that all spectra have the same precursor m/z, which may not be the case
    # Iterate through all the pixels we were asked to process in serial
    for current_index, spectrum_index in enumerate(spectrum_indexes):
        # Determine the start and stop index for the m/z and intensity data of the current spectrum.
        # The last spectrum extends to the end of the peak value array.
        start = fpl_peak_arrayindex[spectrum_index, 2]
        stop = fpl_peak_arrayindex[(spectrum_index+1), 2] \
            if spectrum_index < (num_spectra-1) \
            else fpl_peak_value.size
        spectrum_length = stop - start
        # Skip empty spectra
        if spectrum_length == 0:
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
            print time_str
            continue
        # Load the m/z and intensity values for the current spectrum.
        # Only columns 0 (m/z) and 1 (intensity) are filled here; column 2 stays 0.
        current_peaks_list = np.zeros(shape=(spectrum_length, 3), dtype=float)
        current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
        current_peaks_list[:, 1] = fpl_peak_value[start:stop]

        # Get the parent mass
        # NOTE(review): len(parent_mass) assumes parent_mass is a sized array - confirm for scalar precursor_mz
        current_parent_mass = parent_mass if len(parent_mass) == 1 else parent_mass[spectrum_index]

        start_time = time.time()
        # Call MIDAS to score the current spectrum against all compounds in the database
        current_hits = MIDAS.scoring_C.score_main(
            Compound_list=compound_list,
            bBreakRing=break_rings,
            dCurrentPrecursor_type=precursor_type,
            dCurrentParentMass=current_parent_mass,
            current_peaks_list=current_peaks_list,
            iParentMassWindow_list=parent_mass_windows,
            dMass_Tolerance_Parent_Ion=mass_tolerance_parent_ion,
            dMass_Tolerance_Fragment_Ions=mass_tolerance_fragment_ions,
            iFragmentation_Depth=fragmentation_depth,
            iPositive_Ion_Fragment_Mass_Windows_list=positive_ion_fragment_mass_windows,
            iNegative_Ion_Fragment_Mass_Windows_list=negative_ion_fragment_mass_windows,
            top_n=None)
        end_time = time.time()
        # Report the time needed for scoring and the number of hits for this spectrum
        execution_time = end_time - start_time
        time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
        time_str += " : num hits : " + str(current_hits.shape[0])
        print time_str
        sys.stdout.flush()

        # Initialize the hit_table if necessary
        if hit_table is None:
            # If our compound database does not contain any related compounds then just finish
            if current_hits.shape[0] == 0:
                # Initialize the results as empty and finish as there is nothing to do
                hit_table = np.zeros(shape=(pixel_index.shape[0], 0),
                                     dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)  # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
                continue
            # If our compound database contains at least one relevant compound then check all spectra
            else:
                # Create the data structure to store all results
                hit_table = np.zeros(shape=(pixel_index.shape[0], current_hits.shape[0]),
                                     dtype=current_hits.dtype)  # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
        # Save the hits for the current pixel
        hit_table[current_index] = current_hits

    # No spectrum was scored at all (e.g., all spectra were empty): return an empty table
    if hit_table is None:
        hit_table = np.zeros(shape=(pixel_index.shape[0], 0),
                             dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)

    # Return the hit_table and the index of the pixel each hit_table applies to
    return hit_table, pixel_index
def main(self):
    """
    Default main function for running an analysis from the command line.
    The default implementation exposes all specified analysis parameters as command
    line options to the user. The default implementation also provides means to
    print a help text for the function.

    The function determines the analysis class, parses the driver and analysis
    arguments, executes the analysis, reports profiling and timing information on
    the root rank, and finally saves the analysis if an output target is set.

    :raises: ValueError is raised in case that the analysis class is unknown or is
        not a subclass of analysis_base.
    """
    # Get the analysis object if needed
    if self.add_analysis_class_arg:
        try:
            self.get_analysis_class_from_cl()
        except (ImportError, AttributeError, ValueError):
            # A missing or invalid analysis class is reported by the checks below
            pass

    # Initialize the argument parser
    if self.parser is None:
        self.initialize_argument_parser()

    # Check if we have a valid analysis class. print_help() writes the help text
    # itself, so its return value must not be printed.
    if self.analysis_class is None:
        self.parser.print_help()
        raise ValueError('Could not determine the analysis class.')
    if not issubclass(self.analysis_class, analysis_base):
        self.parser.print_help()
        raise ValueError('Analysis class is not a subclass of analysis_base.')

    try:
        # Parse the command line arguments to determine the command line driver settings
        self.parse_cl_arguments()
        # Add and parse the command line arguments specific to the analysis to determine the analysis settings
        self.add_and_parse_analysis_arguments()
    except:  # Deliberately bare: clean up on any error, including SystemExit, then re-raise
        self.remove_output_target()
        raise

    # Print the analysis settings
    if mpi_helper.get_rank() == self.mpi_root:
        self.print_settings()

    # Call the execute function of the analysis
    try:
        # Create the analysis object
        if self.analysis_object is None:
            self.create_analysis_object()
        # Execute the analysis
        log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                         root=self.mpi_root, comm=self.mpi_comm)
        self.analysis_object.execute(**self.analysis_arguments)
    except:  # Deliberately bare: clean up on any error, then re-raise
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Report profiling and timing results on our mpi root rank. NOTE: When running in serial
    # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
    # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
    if mpi_helper.get_rank() == self.mpi_root:

        # Print the profiling results of time and usage
        if self.analysis_object['profile_time_and_usage']:
            print("")
            print("PROFILING DATA: TIME AND USAGE")
            print("")
            self.analysis_object.get_profile_stats_object(consolidate=True).print_stats()

        # Print the profiling results for memory usage
        if self.analysis_object['profile_memory']:
            print("")
            print("PROFILING DATA: MEMORY")
            print("")
            print(self.analysis_object.get_memory_profile_info())

        # Print the time it took to run the analysis
        run_info = self.analysis_object.run_info
        # Parallel case: We need to compile/collect timing data from all cores
        if isinstance(run_info['execution_time'], list):
            # Time for each task to execute
            log_helper.info(__name__, "Time in seconds for each analysis process: " +
                            str(run_info['execution_time']),
                            root=self.mpi_root, comm=self.mpi_comm)
            # Start times of each task
            log_helper.info(__name__, "Time when each of the processes started: " +
                            str(run_info['start_time']),
                            root=self.mpi_root, comm=self.mpi_comm)
            # Stop times for each task
            log_helper.info(__name__, "Time when each of the processes finished: " +
                            str(run_info['end_time']),
                            root=self.mpi_root, comm=self.mpi_comm)

            # Compile the time to execute string
            exec_time_array = np.asarray(run_info['execution_time'], dtype=float)
            max_exec_time = str(exec_time_array.max())
            min_exec_time = str(exec_time_array.min())
            mean_exec_time = str(exec_time_array.mean())
            exec_time_string = max_exec_time + " s " + \
                " ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
        # Serial case: We only have a single time to worry about
        else:
            exec_time_string = str(run_info['execution_time']) + " s"
        log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                        root=self.mpi_root, comm=self.mpi_comm)

    # Save the analysis to file. create_analysis_static is called on every rank that
    # has an output target; it decides itself what to do based on the rank.
    if self.output_target is not None:
        from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
        omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                     analysis=self.analysis_object)
def main(self):
    """
    Default main function for running a workflow from the command line.
    The default implementation exposes all specified analysis parameters as command
    line options to the user. The default implementation also provides means to
    print a help text for the function.

    The function parses the driver and workflow arguments, optionally enables
    profiling, executes the workflow, reports profiling and timing information on
    the root rank, and finally saves all analyses if an output target is set.

    :raises: ValueError is raised in case that no workflow executor is available.
    """
    # Initialize the argument parser
    if self.parser is None:
        self.initialize_argument_parser()

    try:
        # Parse the command line arguments to determine the command line driver settings
        self.parse_cl_arguments()
    except:  # Deliberately bare: clean up on any error, including SystemExit, then re-raise
        self.remove_output_target()
        raise

    if self.workflow_executor is None:
        self.remove_output_target()
        log_helper.error(__name__, 'Missing --script parameter or worfklow_executor object')
        raise ValueError('Workflow not initalized')

    # Add and parse the command line arguments specific to the analysis to determine the analysis settings
    try:
        self.add_and_parse_workflow_arguments()
    except:  # Deliberately bare: clean up on any error, then re-raise
        self.remove_output_target()
        raise

    # Print the analysis settings
    if mpi_helper.get_rank() == self.mpi_root:
        self.print_settings()

    # Enable time and usage profiling
    try:
        # Enable time and usage profiling if requested
        if self.profile_analyses:
            try:
                self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(self.profile_analyses)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of time and usage not available due to missing packages.")
                # str(e) instead of e.message, which only exists on Python 2 exceptions
                log_helper.warning(__name__, str(e))
        # Enable memory profiling if requested
        if self.profile_analyses_mem:
            try:
                self.workflow_executor.analysis_tasks.enable_memory_profiling(self.profile_analyses_mem)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of memory usage not available due to missing packages")
                log_helper.warning(__name__, str(e))
    except:  # Deliberately bare: clean up on any error, then re-raise
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Execute the analysis
    try:
        log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                         root=self.mpi_root, comm=self.mpi_comm)
        self.workflow_executor.execute()
    except:  # Deliberately bare: clean up on any error, then re-raise
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Report profiling and timing results on our mpi root rank. NOTE: When running in serial
    # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
    # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
    if mpi_helper.get_rank() == self.mpi_root:
        # Print usage profiles if available. Printing is best effort: errors are logged
        # and ignored, but KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            self.print_time_and_usage_profiles()
        except Exception:
            log_helper.error(__name__, "An error occured while trying to print time and usage profiles",
                             root=self.mpi_root, comm=self.mpi_comm)
        # Print memory profile data if available
        try:
            self.print_memory_profiles()
        except Exception:
            log_helper.error(__name__, "An error occured while trying to print memory profiles",
                             root=self.mpi_root, comm=self.mpi_comm)

        # Print the time it took to run the analysis
        run_info = self.workflow_executor.run_info
        # Parallel case: We need to compile/collect timing data from all cores
        if isinstance(run_info['execution_time'], list):
            # Time for each task to execute
            log_helper.info(__name__, "Time in seconds for each analysis process: " +
                            str(run_info['execution_time']),
                            root=self.mpi_root, comm=self.mpi_comm)
            # Start times of each task
            log_helper.info(__name__, "Time when each of the processes started: " +
                            str(run_info['start_time']),
                            root=self.mpi_root, comm=self.mpi_comm)
            # Stop times for each task
            log_helper.info(__name__, "Time when each of the processes finished: " +
                            str(run_info['end_time']),
                            root=self.mpi_root, comm=self.mpi_comm)

            # Compile the time to execute string
            exec_time_array = np.asarray(run_info['execution_time'], dtype=float)
            max_exec_time = str(exec_time_array.max())
            min_exec_time = str(exec_time_array.min())
            mean_exec_time = str(exec_time_array.mean())
            exec_time_string = max_exec_time + " s " + \
                " ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
        # Serial case: We only have a single time to worry about
        else:
            exec_time_string = str(run_info['execution_time']) + " s"
        log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                        root=self.mpi_root, comm=self.mpi_comm)

    # Save the analyses to file. create_analysis_static is called on every rank that
    # has an output target; it decides itself what to do based on the rank.
    if self.output_target is not None:
        from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
        for analysis in self.workflow_executor.analysis_tasks:
            omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                         analysis=analysis)
def parse_cl_arguments(self):
    """
    Parse the arguments that belong to the command-line driver itself.

    The function assumes that the command line parser has been set up using
    initialize_argument_parser(..). Analysis workflow arguments are added and parsed
    later by the add_and_parse_workflow_arguments(...) function. The reason for this is
    two-fold: i) to separate the parsing of analysis arguments and arguments of the
    command-line driver and ii) if the same HDF5 file is used as input and output target,
    then we need to open it first here in append mode before it gets opened in read mode
    later by the arguments.

    *Side effects:* Depending on the arguments that are present, the function sets:

     - ``self.output_target``
     - ``self.profile_analyses``
     - ``self.profile_analyses_mem``
     - ``self.user_log_level``
     - ``self.script_files``

    It may also change the log level via ``log_helper.set_log_level`` and create or
    extend ``self.workflow_executor``.

    :raises ValueError: If the save parameter cannot be parsed into a filename or if it
        points to an existing path that is not a file.
    """
    # Parse the known arguments and convert the resulting namespace to a dict
    cl_options = vars(self.parser.parse_known_args()[0])

    # Process the --save argument to determine where we should save the output
    resolve_target_here = (self.output_save_arg_name in cl_options and
                           mpi_helper.get_rank() == self.mpi_root)
    if not resolve_target_here:
        # All other ranks only record the raw value of the save argument
        self.output_target = cl_options.pop(self.output_save_arg_name)
    else:
        self.output_target = cl_options.pop(self.output_save_arg_name)
        if self.output_target is not None:
            # Determine the filename and the object path from the save string
            target_file, target_object_path = omsi_file_common.parse_path_string(self.output_target)
            if target_file is None:
                raise ValueError("ERROR: Invalid save parameter specification " + self.output_target)
            if os.path.exists(target_file) and not os.path.isfile(target_file):
                raise ValueError("ERROR: Save parameter not specify a file.")
            if os.path.exists(target_file):
                # Existing file: use the requested object or fall back to an experiment
                out_file = omsi_file(target_file, mode='r+')
                if target_object_path is not None:
                    self.output_target = omsi_file_common.get_omsi_object(out_file[target_object_path])
                elif out_file.get_num_experiments() > 0:
                    self.output_target = out_file.get_experiment(0)
                else:
                    self.output_target = out_file.create_experiment()
            else:
                # New file: create it together with a first experiment and remember
                # the name of the file that was created here
                out_file = omsi_file(target_file, mode='a')
                self.output_target = out_file.create_experiment()
                self.__output_target_self = target_file

    # Process the --profile profiling argument
    if self.profile_arg_name in cl_options:
        self.profile_analyses = cl_options.pop(self.profile_arg_name)

    # Process the --memprofile argument
    if self.profile_mem_arg_name in cl_options:
        self.profile_analyses_mem = cl_options.pop(self.profile_mem_arg_name)

    # The --loglevel argument
    if self.log_level_arg_name in cl_options:
        self.user_log_level = cl_options.pop(self.log_level_arg_name)
        if self.user_log_level in log_helper.log_levels.keys():
            log_helper.set_log_level(level=log_helper.log_levels[self.user_log_level])
        else:
            # Forget the invalid value before reporting it
            self.user_log_level = None
            log_helper.error(module_name=__name__, message="Invalid log level specified")

    # The --script arguments
    if self.script_arg_name in cl_options:
        self.script_files = cl_options.pop(self.script_arg_name)
        if self.workflow_executor is None:
            self.create_workflow_executor_object()
        else:
            self.workflow_executor.add_analysis_from_scripts(script_files=self.script_files)
def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
    """
    Score the spectra of ``self['fpl_data']`` against the Pactolus fragmentation trees.

    If no ``spectrum_indexes`` are given and more than one MPI rank is available, the
    function schedules itself via ``mpi_helper.parallel_over_axes`` and compiles the
    results of the individual blocks. Otherwise the requested spectra are scored in serial.

    :param spectrum_indexes: List with a list of integer indices of the subset of spectra
        that should be processed by this MPI task. If spectrum_indexes is set, then the given
        subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
        (if available). This parameter is strictly optional and intended for internal use only
        to facilitate the efficient parallel implementation. A single integer is accepted too.
    :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.
        If None, the table is loaded/created from ``self['trees']``.

    :returns: A series of numpy arrays with the score data for each pixel and a 2D array
        of pixel indices describing for each spectrum the (x,y) pixel location in the image.

        ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']

        * 'pixel_index', int, 2D array of pixel indices describing for each spectrum \
          the (x,y) pixel location in the image
        * 'score', float, MIDAS score of row
        * 'id', str, database ID e.g. 'MetaCyC_7884'
        * 'name', str, database name, e.g. 'glycine'
        * 'mass', float, mass in Da of IDed compound
        * 'n_peaks', int, number of peaks in data
        * 'n_match', int, number of peaks in data matched

        If ``self['metabolite_database']`` is empty, then only 'pixel_index' and 'score'
        are populated and 'score' contains one row of per-tree scores per scored spectrum.
        Empty spectra are not scored and do not appear in the output.
    """
    log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
    # Get the data we need to process
    fpl_data = self['fpl_data']
    fpl_peak_mz = fpl_data['peak_mz']
    fpl_peak_value = fpl_data['peak_value']
    fpl_peak_arrayindex = fpl_data['peak_arrayindex']
    # Calculate the parent_mass. A value of -1 means: use the masses stored with the peak data
    precursor_mz = self['precursor_mz']
    if precursor_mz == -1:
        precursor_mz = self['fpl_data']['precursor_mz'][:]
    # Assign parameter settings to local variables for convenience
    metabolite_database = self['metabolite_database']
    ms1_mass_tol = self['ms1_mass_tolerance']
    ms2_mass_tol = self['ms2_mass_tolerance']
    neutralizations = self['neutralizations']
    max_depth = self['max_depth']

    # Make the numpy array with the list of tree files and their MS1 masses
    if file_lookup_table is None:
        # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
        log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
        tree_source = self['trees']
        if os.path.isfile(tree_source):
            if tree_source.endswith('.npy'):
                file_lookup_table = np.load(tree_source)
            else:
                # Text file with one tree file per line. The context manager guarantees
                # that the file is closed even if reading fails.
                with open(tree_source, 'r') as in_treefile:
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
        elif os.path.isdir(tree_source):
            file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=tree_source)
        # NOTE(review): if self['trees'] is neither a file nor a directory the lookup table
        # stays None and is passed on to the scoring as is - confirm that this is intended.

    # Define the common pactolus parameters
    pactolus_parameters = {'file_lookup_table': file_lookup_table,
                           'ms1_mass_tol': ms1_mass_tol,
                           'ms2_mass_tol': ms2_mass_tol,
                           'neutralizations': neutralizations,
                           'max_depth': max_depth}

    # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
    # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
    # where we can find the spectrum that we need to process
    num_spectra = fpl_peak_arrayindex.shape[0]
    if spectrum_indexes is None:
        # Get the complete peak array index data
        spectrum_indexes = np.arange(0, num_spectra)
        enable_parallel = True
    else:
        # Accept plain Python integers as well as NumPy integer scalars
        if isinstance(spectrum_indexes, (int, np.integer)):
            spectrum_indexes = np.asarray([spectrum_indexes, ])
        enable_parallel = False

    #############################################################
    # Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if enable_parallel:
            log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = [0, ]
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,                             # Execute this function
                task_function_params={'file_lookup_table': file_lookup_table},   # Reuse the file_lookup_table
                main_data=spectrum_indexes,                                      # Process the spectra independently
                split_axes=split_axis,                                           # Split along axes
                main_data_param_name='spectrum_indexes',                         # data input param
                root=self.mpi_root,                                              # The root MPI task
                schedule=self['schedule'],                                       # Parallel scheduling scheme
                comm=self.mpi_comm)                                              # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # Empty defaults for the compiled outputs, in the order of the return values:
            # pixel_index, score, id, name, mass, n_peaks, n_match.
            # 'S100' is the same fixed-length byte-string dtype as the legacy alias 'a100'.
            compiled = [np.zeros((0, 2), dtype='int'),
                        np.zeros((0,), dtype='f4'),
                        np.zeros((0,), dtype='S100'),
                        np.zeros((0,), dtype='S100'),
                        np.zeros((0,), dtype='f4'),
                        np.zeros((0,), dtype='i4'),
                        np.zeros((0,), dtype='i4')]
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # TODO NEED to update since collect now returns a single list not a list of lists
            # With dynamic scheduling the root does not process any data, i.e., if the data was
            # not collected to the root either, then there is nothing to compile on the root.
            root_without_data = (not self['collect'] and
                                 (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule))
            if not root_without_data:
                log_helper.debug(__name__, 'Compiling output')
                # Concatenate the outputs of all blocks, one output field at a time
                for field_index in range(len(compiled)):
                    temp_data = [ri[field_index] for ri in result[0]]
                    if len(temp_data) > 0:
                        compiled[field_index] = np.concatenate(tuple(temp_data), axis=0)
                log_helper.log_var(__name__, score=compiled[1])
            # Return the compiled output
            return tuple(compiled)

    #############################################################
    # Serial processing of the current data block
    #############################################################
    log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
    hit_matrix = []        # One row with the scores of all trees for each scored spectrum
    scored_indexes = []    # The spectrum index that belongs to each row of hit_matrix
    # Iterate through all the pixel we were asked to process in serial
    for spectrum_index in spectrum_indexes:
        # Determine the start and stop index for the m/z and intensity data of the current spectrum
        start = int(fpl_peak_arrayindex[spectrum_index, 2])
        stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                   if spectrum_index < (num_spectra - 1)
                   else fpl_peak_value.size)
        spectrum_length = stop - start
        # Skip empty spectra
        if spectrum_length == 0:
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
            log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            continue
        # Load the m/z and intensity values for the current spectrum
        current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
        current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
        current_peaks_list[:, 1] = fpl_peak_value[start:stop]

        # Get the parent mass: either one mass shared by all spectra or one mass per spectrum
        current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

        start_time = time.time()
        # Call MIDAS to score the current spectrum against all compounds in the database
        current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                    ms1_mz=[current_parent_mass, ],
                                                                    params=pactolus_parameters)
        end_time = time.time()
        execution_time = end_time - start_time
        time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                   str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
        time_str += " : num hits : " + str((current_hits > 0).sum())
        # Written directly to stdout (same output as the former print statement)
        sys.stdout.write(time_str + "\n")
        sys.stdout.flush()

        # Save the hits for the current pixel together with the index of the spectrum.
        # Skipped spectra have no row in hit_matrix, i.e., the position in spectrum_indexes
        # cannot be used to find the hits of a spectrum.
        hit_matrix.append(current_hits[0, :])
        scored_indexes.append(spectrum_index)

    # Index the results based on the given metabolite database
    score = []
    id_data = []
    name = []
    mass = []
    n_peaks = []
    n_match = []
    pixel_index = []
    if len(metabolite_database) > 0:  # We don't have an empty string
        for spectrum_index, current_scores in zip(scored_indexes, hit_matrix):
            # np.where returns one index array per dimension, we need the one of the first axis
            non_zero_scores = np.where(current_scores > 0)[0]
            if non_zero_scores.size > 0:
                current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                    pactolus_results=current_scores,
                    table_file=file_lookup_table,
                    original_db=metabolite_database))
                # NOTE(review): assumes the rows of the hit table are in the same order as
                # the scores handed to make_pactolus_hit_table - confirm against score_frag_dag
                for score_index in non_zero_scores:
                    pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                    score.append(current_hit_table['score'][score_index])
                    id_data.append(current_hit_table['id'][score_index])
                    name.append(current_hit_table['name'][score_index])
                    mass.append(current_hit_table['mass'][score_index])
                    n_peaks.append(current_hit_table['n_peaks'][score_index])
                    n_match.append(current_hit_table['n_match'][score_index])
        pixel_index = np.asarray(pixel_index)
        if pixel_index.size == 0:
            # Keep the (n, 2) layout also if there are no hits, so that the result
            # can be concatenated with the results of other blocks
            pixel_index = np.zeros((0, 2), dtype='int')
    else:
        # Only report the pixels of the spectra that have a row in the score matrix
        pixel_index = fpl_peak_arrayindex[np.asarray(scored_indexes, dtype='int'), 0:2]
        score = np.asarray(hit_matrix)

    # Return the hit_table and the index of the pixel each hit_table applies to
    sys.stdout.write("rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score) + "\n")
    sys.stdout.flush()
    return np.asarray(pixel_index), \
        np.asarray(score), \
        np.asarray(id_data), \
        np.asarray(name), \
        np.asarray(mass), \
        np.asarray(n_peaks), \
        np.asarray(n_match)
def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
    """
    Score the spectra of ``self['fpl_data']`` against the Pactolus fragmentation trees.

    If no ``spectrum_indexes`` are given and more than one MPI rank is available, the
    function schedules itself via ``mpi_helper.parallel_over_axes`` and compiles the
    results of the individual blocks. Otherwise the requested spectra are scored in serial.

    :param spectrum_indexes: List with a list of integer indices of the subset of spectra
        that should be processed by this MPI task. If spectrum_indexes is set, then the given
        subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
        (if available). This parameter is strictly optional and intended for internal use only
        to facilitate the efficient parallel implementation. A single integer is accepted too.
    :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.
        If None, the table is loaded/created from ``self['trees']``.

    :returns: A series of numpy arrays with the score data for each pixel and a 2D array
        of pixel indices describing for each spectrum the (x,y) pixel location in the image.

        ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']

        * 'pixel_index', int, 2D array of pixel indices describing for each spectrum \
          the (x,y) pixel location in the image
        * 'score', float, MIDAS score of row
        * 'id', str, database ID e.g. 'MetaCyC_7884'
        * 'name', str, database name, e.g. 'glycine'
        * 'mass', float, mass in Da of IDed compound
        * 'n_peaks', int, number of peaks in data
        * 'n_match', int, number of peaks in data matched

        If ``self['metabolite_database']`` is empty, then only 'pixel_index' and 'score'
        are populated and 'score' contains one row of per-tree scores per scored spectrum.
        Empty spectra are not scored and do not appear in the output.
    """
    log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
    # Get the data we need to process
    fpl_data = self['fpl_data']
    fpl_peak_mz = fpl_data['peak_mz']
    fpl_peak_value = fpl_data['peak_value']
    fpl_peak_arrayindex = fpl_data['peak_arrayindex']
    # Calculate the parent_mass. A value of -1 means: use the masses stored with the peak data
    precursor_mz = self['precursor_mz']
    if precursor_mz == -1:
        precursor_mz = self['fpl_data']['precursor_mz'][:]
    # Assign parameter settings to local variables for convenience
    metabolite_database = self['metabolite_database']
    ms1_mass_tol = self['ms1_mass_tolerance']
    ms2_mass_tol = self['ms2_mass_tolerance']
    neutralizations = self['neutralizations']
    max_depth = self['max_depth']

    # Make the numpy array with the list of tree files and their MS1 masses
    if file_lookup_table is None:
        # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
        log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
        tree_source = self['trees']
        if os.path.isfile(tree_source):
            if tree_source.endswith('.npy'):
                file_lookup_table = np.load(tree_source)
            else:
                # Text file with one tree file per line. The context manager guarantees
                # that the file is closed even if reading fails.
                with open(tree_source, 'r') as in_treefile:
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
        elif os.path.isdir(tree_source):
            file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=tree_source)
        # NOTE(review): if self['trees'] is neither a file nor a directory the lookup table
        # stays None and is passed on to the scoring as is - confirm that this is intended.

    # Define the common pactolus parameters
    pactolus_parameters = {'file_lookup_table': file_lookup_table,
                           'ms1_mass_tol': ms1_mass_tol,
                           'ms2_mass_tol': ms2_mass_tol,
                           'neutralizations': neutralizations,
                           'max_depth': max_depth}

    # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
    # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
    # where we can find the spectrum that we need to process
    num_spectra = fpl_peak_arrayindex.shape[0]
    if spectrum_indexes is None:
        # Get the complete peak array index data
        spectrum_indexes = np.arange(0, num_spectra)
        enable_parallel = True
    else:
        # Accept plain Python integers as well as NumPy integer scalars
        if isinstance(spectrum_indexes, (int, np.integer)):
            spectrum_indexes = np.asarray([spectrum_indexes, ])
        enable_parallel = False

    #############################################################
    # Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if enable_parallel:
            log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = [0, ]
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,                             # Execute this function
                task_function_params={'file_lookup_table': file_lookup_table},   # Reuse the file_lookup_table
                main_data=spectrum_indexes,                                      # Process the spectra independently
                split_axes=split_axis,                                           # Split along axes
                main_data_param_name='spectrum_indexes',                         # data input param
                root=self.mpi_root,                                              # The root MPI task
                schedule=self['schedule'],                                       # Parallel scheduling scheme
                comm=self.mpi_comm)                                              # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # Empty defaults for the compiled outputs, in the order of the return values:
            # pixel_index, score, id, name, mass, n_peaks, n_match.
            # 'S100' is the same fixed-length byte-string dtype as the legacy alias 'a100'.
            compiled = [np.zeros((0, 2), dtype='int'),
                        np.zeros((0,), dtype='f4'),
                        np.zeros((0,), dtype='S100'),
                        np.zeros((0,), dtype='S100'),
                        np.zeros((0,), dtype='f4'),
                        np.zeros((0,), dtype='i4'),
                        np.zeros((0,), dtype='i4')]
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # TODO NEED to update since collect now returns a single list not a list of lists
            # With dynamic scheduling the root does not process any data, i.e., if the data was
            # not collected to the root either, then there is nothing to compile on the root.
            root_without_data = (not self['collect'] and
                                 (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule))
            if not root_without_data:
                log_helper.debug(__name__, 'Compiling output')
                # Concatenate the outputs of all blocks, one output field at a time
                for field_index in range(len(compiled)):
                    temp_data = [ri[field_index] for ri in result[0]]
                    if len(temp_data) > 0:
                        compiled[field_index] = np.concatenate(tuple(temp_data), axis=0)
                log_helper.log_var(__name__, score=compiled[1])
            # Return the compiled output
            return tuple(compiled)

    #############################################################
    # Serial processing of the current data block
    #############################################################
    log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
    hit_matrix = []        # One row with the scores of all trees for each scored spectrum
    scored_indexes = []    # The spectrum index that belongs to each row of hit_matrix
    # Iterate through all the pixel we were asked to process in serial
    for spectrum_index in spectrum_indexes:
        # Determine the start and stop index for the m/z and intensity data of the current spectrum
        start = int(fpl_peak_arrayindex[spectrum_index, 2])
        stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                   if spectrum_index < (num_spectra - 1)
                   else fpl_peak_value.size)
        spectrum_length = stop - start
        # Skip empty spectra
        if spectrum_length == 0:
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
            log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            continue
        # Load the m/z and intensity values for the current spectrum
        current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
        current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
        current_peaks_list[:, 1] = fpl_peak_value[start:stop]

        # Get the parent mass: either one mass shared by all spectra or one mass per spectrum
        current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

        start_time = time.time()
        # Call MIDAS to score the current spectrum against all compounds in the database
        current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                    ms1_mz=[current_parent_mass, ],
                                                                    params=pactolus_parameters)
        end_time = time.time()
        execution_time = end_time - start_time
        time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                   str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
        time_str += " : num hits : " + str((current_hits > 0).sum())
        # Written directly to stdout (same output as the former print statement)
        sys.stdout.write(time_str + "\n")
        sys.stdout.flush()

        # Save the hits for the current pixel together with the index of the spectrum.
        # Skipped spectra have no row in hit_matrix, i.e., the position in spectrum_indexes
        # cannot be used to find the hits of a spectrum.
        hit_matrix.append(current_hits[0, :])
        scored_indexes.append(spectrum_index)

    # Index the results based on the given metabolite database
    score = []
    id_data = []
    name = []
    mass = []
    n_peaks = []
    n_match = []
    pixel_index = []
    if len(metabolite_database) > 0:  # We don't have an empty string
        for spectrum_index, current_scores in zip(scored_indexes, hit_matrix):
            # np.where returns one index array per dimension, we need the one of the first axis
            non_zero_scores = np.where(current_scores > 0)[0]
            if non_zero_scores.size > 0:
                current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                    pactolus_results=current_scores,
                    table_file=file_lookup_table,
                    original_db=metabolite_database))
                # NOTE(review): assumes the rows of the hit table are in the same order as
                # the scores handed to make_pactolus_hit_table - confirm against score_frag_dag
                for score_index in non_zero_scores:
                    pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                    score.append(current_hit_table['score'][score_index])
                    id_data.append(current_hit_table['id'][score_index])
                    name.append(current_hit_table['name'][score_index])
                    mass.append(current_hit_table['mass'][score_index])
                    n_peaks.append(current_hit_table['n_peaks'][score_index])
                    n_match.append(current_hit_table['n_match'][score_index])
        pixel_index = np.asarray(pixel_index)
        if pixel_index.size == 0:
            # Keep the (n, 2) layout also if there are no hits, so that the result
            # can be concatenated with the results of other blocks
            pixel_index = np.zeros((0, 2), dtype='int')
    else:
        # Only report the pixels of the spectra that have a row in the score matrix
        pixel_index = fpl_peak_arrayindex[np.asarray(scored_indexes, dtype='int'), 0:2]
        score = np.asarray(hit_matrix)

    # Return the hit_table and the index of the pixel each hit_table applies to
    sys.stdout.write("rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score) + "\n")
    sys.stdout.flush()
    return np.asarray(pixel_index), \
        np.asarray(score), \
        np.asarray(id_data), \
        np.asarray(name), \
        np.asarray(mass), \
        np.asarray(n_peaks), \
        np.asarray(n_match)
def execute_analysis(self, msidata_subblock=None):
    """
    Execute the local peak finder for the given msidata.

    If more than one MPI rank is available and ``self['msidata']`` has more than one
    dimension, the function schedules itself via ``mpi_helper.parallel_over_axes`` and
    compiles the results of the individual blocks. Otherwise the data is processed in serial.

    :param msidata_subblock: Optional input parameter used for parallel execution of the
        analysis only. If msidata_subblock is set, then the given subblock will be processed
        in SERIAL instead of processing self['msidata'] in PARALLEL (if available). This
        parameter is strictly optional and intended for internal use only.

    :returns: Tuple of ``(peak_mz, peak_values, peak_arrayindex, mzdata)``:

        * ``peak_mz`` : The x values of all peaks, stored in a linear array
        * ``peak_values`` : The y values of all peaks, stored in a linear array
        * ``peak_arrayindex`` : One ``[x, y, offset]`` row per pixel, where ``offset`` is the
          index in ``peak_mz`` and ``peak_values`` at which the peaks of the pixel start
        * ``mzdata`` : The m/z axis of the data

        On the root rank the first three values are None if dynamic scheduling is used
        and the results are not collected.
    """
    # Make sure needed imports are available
    import sys
    from omsi.analysis.findpeaks.third_party.findpeaks import findpeaks
    import numpy as np

    # Assign parameters to local variables for convenience
    msidata = self['msidata']
    if msidata_subblock is not None:
        msidata = msidata_subblock
    mzdata = self['mzdata']
    peakheight = self['peakheight']
    slwindow = self['slwindow']
    smoothwidth = self['smoothwidth']
    print_status = self['printStatus']

    #############################################################
    # Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(self['msidata'].shape) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if msidata_subblock is None:
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            # The axes along which we can split the data (always handed over as a list)
            split_axis = list(range(len(self['msidata'].shape) - 1))
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,        # Execute this function
                task_function_params={},                    # No added parameters
                main_data=msidata,                          # Process the msidata
                split_axes=split_axis,                      # Split along axes
                main_data_param_name='msidata_subblock',    # data input param
                root=self.mpi_root,                         # The root MPI task
                schedule=self['schedule'],                  # Parallel schedule
                comm=self.mpi_comm)                         # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # TODO Record runtime information data from the scheduler in our provenance data

            # Compile the data from the parallel execution
            # Case Table:
            #
            # collect + worker   2
            # worker             2
            # collect + root     3
            # root               1
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])
            is_root = (mpi_helper.get_rank() == self.mpi_root)

            # Case 1: root rank with collection of the data disabled
            if is_root and not self['collect']:
                # We did not process any data on the root if DYNAMIC scheduling was used
                if use_dynamic_schedule:
                    return None, None, None, mzdata
                # Otherwise the root processed a data block itself
                return result[0][0]

            # Case 2 + 3: Compile the results from all processing task (on workers) or
            # from all workers (on the root with collect)
            peak_mz = np.concatenate(tuple([ri[0] for ri in result[0]]), axis=-1)
            peak_values = np.concatenate(tuple([ri[1] for ri in result[0]]), axis=-1)
            # Offset at which the peaks of each block start in the concatenated arrays
            block_offsets = np.cumsum([0] + [len(ri[0]) for ri in result[0]])
            if not is_root:
                # Case 2: Compile the data on the worker
                if len(result[1]) > 1:
                    # Correct indices from the individual runs since they all start at 0
                    peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                    peak_arrayindex[:, 2] = block_offsets[:-1]
                else:
                    peak_arrayindex = result[0][0][2]
            else:
                # Case 3: Compile collected data on the root.
                # Dynamic scheduling uses selections of (int,int,slice) while the static
                # scheduling uses (slice, slice, slice), hence we need to compile the
                # peak_arrayindex slightly differently depending on the scheduler used
                if use_dynamic_schedule:
                    peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                    peak_arrayindex[:, 2] = block_offsets[:-1]
                else:
                    peak_arrayindex = np.concatenate(tuple([ri[2] for ri in result[0]]), axis=0)
                    # First row of each block in the concatenated peak_arrayindex
                    row_offsets = np.cumsum([0] + [len(ri[2]) for ri in result[0]])
                    for di in range(len(row_offsets) - 1):
                        peak_arrayindex[row_offsets[di]:row_offsets[di + 1], 2] += block_offsets[di]
            mzdata = result[0][0][3]
            return peak_mz, peak_values, peak_arrayindex, mzdata

    #############################################################
    # Serial processing of the current data block
    #############################################################
    # Ensure that our MSI dataset has sufficient numbers of dimensions
    if len(msidata.shape) == 1:
        msidata = msidata[:][np.newaxis, np.newaxis, :]
    elif len(msidata.shape) == 2:
        msidata = msidata[:][np.newaxis, :]

    # Determine the data dimensions
    shape_x = msidata.shape[0]
    shape_y = msidata.shape[1]
    num_pixels = shape_x * shape_y

    peak_mz = []      # The x values for all peaks, stored in a linear array
    peak_values = []  # The y values for all peaks, stored in a linear array
    # List describing for each pixel the start index where its peaks
    # are stored in the peak_mz and peak_values array
    peak_arrayindex = np.zeros(shape=(num_pixels, 3), dtype='int64')
    current_index = 0
    pixel_index = 0
    # Load the m/z axis once instead of for every peak finder call
    mz_values = mzdata[:]
    for xi in range(shape_x):
        for yi in range(shape_y):
            if print_status:
                sys.stdout.write("[" + str(int(100. * float(pixel_index) / float(num_pixels))) + "%]" + "\r")
                sys.stdout.flush()

            # Load the spectrum
            y = msidata[xi, yi, :]
            # Smooth the spectrum
            peak_finder = findpeaks(mz_values, y, smoothwidth, slwindow, peakheight)
            y = peak_finder.smoothListGaussian()
            # from the smoothed spectra subtract a sliding minima
            peak_finder = findpeaks(mz_values, y, smoothwidth, slwindow, peakheight)
            slmin = list(peak_finder.sliding_window_minimum())
            y = y - slmin
            # find peaks in the smoothed, background subtracted spectra
            peak_finder = findpeaks(mz_values, y, smoothwidth, slwindow, peakheight)
            pkmax, _ = peak_finder.peakdet()
            xp = [x[0] for x in pkmax]
            yp = [x[1] for x in pkmax]
            # Extend the lists in place. Creating a new list for every pixel copies all
            # peaks found so far and is quadratic in the number of peaks.
            peak_mz.extend(xp)
            peak_values.extend(yp)
            peak_arrayindex[pixel_index, 0] = xi
            peak_arrayindex[pixel_index, 1] = yi
            peak_arrayindex[pixel_index, 2] = current_index
            pixel_index += 1
            current_index += len(yp)

    # Return the results as numpy arrays to ensure a consistent behavior when
    # the data is accessed and written to file
    return np.asarray(peak_mz), np.asarray(peak_values), peak_arrayindex, mzdata[:]