def __read_all(self, filename):
    """
    Internal helper function used to read all data. The function directly modifies
    the self.data entry. The data is stored as a single 3D datacube.
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')
    # Compute the bin edges for reinterpolation if needed
    if self.imzml_type == self.available_imzml_types['processed']:
        shift = np.diff(self.mz).mean()
        bin_edges = np.append(self.mz, self.mz[-1] + shift)
    else:
        bin_edges = None
    for ind in xrange(0, len(reader.coordinates)):
        xidx, yidx = reader.coordinates[ind]
        # Coordinates may start at arbitrary locations, hence, we need to subtract
        # the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min
        # Read the spectrum
        mz, intens = reader.getspectrum(ind)
        # Reinterpolate intensities if we are in processed mode
        if bin_edges is not None:
            intens, bin_edges_new = np.histogram(mz, bins=bin_edges, weights=intens)
        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
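# A minimal, self-contained sketch of the re-binning trick used above: intensities from an
# arbitrary m/z sampling are accumulated onto a fixed m/z axis via np.histogram with weights.
# All axis values and intensities below are made up for illustration.
import numpy as np

# Hypothetical common m/z axis (plays the role of self.mz in the reader above)
common_mz = np.linspace(100.0, 1000.0, 901)
# Bin edges: one extra edge appended past the last axis value
shift = np.diff(common_mz).mean()
bin_edges = np.append(common_mz, common_mz[-1] + shift)

# One spectrum in "processed" mode with its own m/z sampling and intensities
spectrum_mz = np.array([100.2, 250.7, 250.9, 999.5])
spectrum_intens = np.array([10.0, 5.0, 2.0, 1.0])

# Sum the intensities that fall into each bin of the common axis
rebinned, _ = np.histogram(spectrum_mz, bins=bin_edges, weights=spectrum_intens)
assert rebinned.shape == common_mz.shape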
def remove_output_target(self):
    """
    This function is used to delete any output target files created by the
    command line driver. This is done in case that an error occurred and
    we do not want to leave garbage files left over.

    *Side effects* The function modifies ``self.output_target``

    :return: Boolean indicating whether we successfully cleaned up the output
    """
    success = False
    if self.__output_target_self is not None:
        try:
            os.remove(self.__output_target_self)
            log_helper.info(__name__, "Successfully removed output target: " +
                            unicode(self.__output_target_self))
            success = True
        except:
            log_helper.error(__name__, "Clean-up of output failed. File may be left on system: " +
                             unicode(self.__output_target_self))
    elif self.output_target is not None:
        log_helper.info(__name__, "Output target not removed because it was not created " +
                        "by the analysis but potentially modified by it")
    else:
        success = True
    return success
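# A small sketch (not part of the driver above) of the same clean-up idea with a narrower
# exception handler: catching OSError instead of a bare except keeps unrelated errors
# (e.g., KeyboardInterrupt) from being swallowed. The path used below is hypothetical.
import os
import logging

def remove_if_exists(path):
    """Try to delete a file; report success instead of raising."""
    try:
        os.remove(path)
        logging.info("Removed output target: %s", path)
        return True
    except OSError as os_error:
        # Covers missing files, permission problems, etc.
        logging.error("Clean-up failed, file may remain: %s (%s)", path, os_error)
        return False

# Usage with a made-up path
remove_if_exists("/tmp/openmsi_example_output.h5")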
def __read_all(self, filename):
    """
    Internal helper function used to read all data. The function directly modifies
    the self.data entry. The data is stored as a single 3D datacube.
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')
    # Compute the bin edges for reinterpolation if needed
    if self.imzml_type == self.available_imzml_types['processed']:
        shift = np.diff(self.mz).mean()
        bin_edges = np.append(self.mz, self.mz[-1] + shift)
    else:
        bin_edges = None
    for ind in xrange(0, len(reader.coordinates)):
        xidx, yidx = reader.coordinates[ind]
        # Coordinates may start at arbitrary locations, hence, we need to subtract
        # the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min
        # Read the spectrum
        mz, intens = reader.getspectrum(ind)
        # Reinterpolate intensities onto the common m/z axis if we are in processed mode
        if bin_edges is not None:
            f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
            intens = f(self.mz)
            # intens, bin_edges_new = np.histogram(mz, bins=bin_edges, weights=intens)
        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
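# A minimal sketch of the interpolation-based alternative used in this variant:
# scipy.interpolate.interp1d (or np.interp) resamples a spectrum onto a common m/z axis,
# padding with zeros outside the measured range. All values are illustrative.
import numpy as np
from scipy import interpolate

common_mz = np.linspace(100.0, 1000.0, 901)           # target axis
spectrum_mz = np.array([150.0, 300.0, 450.0, 900.0])  # measured m/z values
spectrum_intens = np.array([4.0, 8.0, 2.0, 1.0])      # measured intensities

# scipy variant: return zero outside the measured range instead of raising
f = interpolate.interp1d(spectrum_mz, spectrum_intens, fill_value=0, bounds_error=False)
resampled = f(common_mz)

# numpy variant with explicit left/right fill values gives a comparable result
resampled_np = np.interp(common_mz, spectrum_mz, spectrum_intens, left=0.0, right=0.0)
assert resampled.shape == common_mz.shape == resampled_np.shape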
def append(self, analysis_object):
    """
    Add a given analysis to the set of objects to be executed by the workflow

    This is the same as set.add() but we ensure that only analysis_base objects are added.

    :param analysis_object: Analysis object to be added to the execution.
        All dependencies of the analysis will also be executed as part of the execution.
    :type analysis_object: omsi.analysis.base.analysis_base

    :raises: ValueError is raised if the given analysis_object is invalid
    """
    from omsi.analysis.base import analysis_base
    if isinstance(analysis_object, analysis_base):
        if analysis_object in self:
            log_helper.debug(__name__, "Analysis already in the list of tasks")
            return
        log_helper.info(__name__, "Adding analysis object to the workflow set. " + str(analysis_object))
        super(analysis_task_list, self).append(analysis_object)
    else:
        raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
def __init__(self, basename, requires_slicing=True, resolution=5):
    """
    Open an mzML file for data reading.

    :param basename: The name of the mzml file. If basename is a directory, then the first
                     mzML file found in the directory will be used instead.
    :type basename: string

    :param requires_slicing: Should the complete data be read into memory
                             (this makes slicing easier). (default is True)
    :type requires_slicing: bool

    :param resolution: For profile data only, the minimum m/z spacing to use for creating
                       the "full" reprofiled data cube
    :type resolution: float
    """
    # Determine the correct base
    if os.path.isdir(basename):
        filelist = self.get_files_from_dir(basename)
        if len(filelist) > 0:
            basename = filelist[0]
        else:
            raise ValueError("No valid mzML file found in the given directory.")
    # self.basename = basename
    # self.requires_slicing = requires_slicing

    # Call super constructor. This sets self.basename and self.readall
    super(xmassmzml_file, self).__init__(basename=basename, requires_slicing=requires_slicing)
    self.resolution = resolution
    self.data_type = 'uint32'  # TODO What data type should we use for the interpolated data?
    self.num_scans = self.__compute_num_scans(filename=self.basename)
    log_helper.info(__name__, 'Read %s scans from mzML file.' % self.num_scans)
    log_helper.debug(__name__, 'Compute coordinates')
    self.coordinates = self.__compute_coordinates(filename=self.basename, num_scans=self.num_scans)

    # Compute the spatial configuration of the matrix
    self.x_pos = np.unique(self.coordinates[:, 0])
    self.y_pos = np.unique(self.coordinates[:, 1])
    self.step_size = min([min(np.diff(self.x_pos)), min(np.diff(self.y_pos))])

    # Compute the mz axis
    log_helper.debug(__name__, 'Compute mz axes')
    self.mz = self.__compute_mz_axis(filename=self.basename)
    log_helper.debug(__name__, 'mz axes computed')

    # Determine the shape of the dataset, result is a list of shapes for each datacube
    # self.shape_all_data = [(self.x_pos.shape[0], self.y_pos.shape[0], mz.shape[0]) for mz in self.mz_all]
    log_helper.debug(__name__, 'Compute shape')
    self.shape = (self.x_pos.shape[0], self.y_pos.shape[0], len(self.mz))
    # self.shape = None
    # self.mz = None

    # Read the data into memory
    # self.data = None
    log_helper.debug(__name__, 'read all')
    if requires_slicing:
        self.data = self.__read_all()
    log_helper.debug(__name__, 'Finished with init')
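# A short sketch of the coordinate handling above: np.unique recovers the distinct x/y stage
# positions and np.diff gives the pixel pitch, from which the datacube shape follows.
# The coordinates and m/z length below are made up.
import numpy as np

# Hypothetical (x, y) stage coordinates, one row per spectrum, not necessarily ordered
coordinates = np.array([[10, 5], [12, 5], [14, 5],
                        [10, 7], [12, 7], [14, 7]])

x_pos = np.unique(coordinates[:, 0])   # -> [10, 12, 14]
y_pos = np.unique(coordinates[:, 1])   # -> [5, 7]
step_size = min(np.diff(x_pos).min(), np.diff(y_pos).min())

# Shape of the (x, y, m/z) datacube for a hypothetical 901-point m/z axis
shape = (x_pos.shape[0], y_pos.shape[0], 901)
assert step_size == 2 and shape == (3, 2, 901)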
def __read_all(self): """ Internal helper function used to read all data. The function directly modifies the self.data entry. Data is now a list of datacubes """ self.data = [ np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type) for scan_idx, scantype in enumerate(self.scan_types) ] for scan_idx, scantype in enumerate(self.scan_types): reader = mzml.read(self.basename) spectrumid = 0 if not self.scan_profiled[scan_idx]: shift = np.diff(self.mz_all[scan_idx]).mean() bin_edges = np.append(self.mz_all[scan_idx], self.mz_all[scan_idx][-1] + shift) else: bin_edges = None for spectrum in reader: if spectrum['scanList']['scan'][0][ 'filter string'] == scantype: x = spectrum['m/z array'] try: y = spectrum['intensity array'] except KeyError: raise KeyError if bin_edges is None: yi = np.interp( self.mz_all[scan_idx], x, y, 0, 0) # Re-interpolate the data in profiled mode else: yi, _ = np.histogram( x, bins=bin_edges, weights=y ) # Re-histogram the data in centroided mode xidx = np.nonzero( self.x_pos == self.coordinates[spectrumid, 0])[0] yidx = np.nonzero( self.y_pos == self.coordinates[spectrumid, 1])[0] try: self.data[scan_idx][xidx, yidx, :] = yi except: log_helper.debug(__name__, spectrumid, scan_idx, scantype, self.mz_all[scan_idx].shape) # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly if spectrumid % 1000 == 0: log_helper.info( __name__, 'Processed data for %s spectra to datacube for scan type %s' % (spectrumid, scantype)) spectrumid += 1
def collect_data(self, force_collect=False):
    """
    Collect the results from the parallel execution to the self.root rank.

    NOTE: On the root the self.result, self.blocks, and self.block_times variables are
    updated with the collected data as well and self.__data_collected will be set

    NOTE: If the data has already been collected previously (i.e., collect_data has been called before),
    then the collection will not be performed again, unless force_collect is set.

    :param force_collect: Set this parameter to force that data collection is performed again.
        By default the collect_data is performed only once for each time the run(..) function is
        called and the results are reused to ensure consistent data structures. We can force that
        collect will be re-executed anyway by setting force_collect.

    :return: On worker ranks (i.e., MPI_RANK!=self.root) this is simply the self.result and self.blocks
        containing the result created by the run function. On the root rank (i.e., MPI_RANK==self.root)
        this is a tuple of two lists containing the combined data of all self.result and self.blocks
        from all ranks respectively.
    """
    try:
        from omsi.shared.log import log_helper
    except ImportError:
        from pactolus.third_party.log import log_helper
    # If we have collected the data already then we don't need to do it again
    if self.__data_collected and not force_collect:
        return self.result, self.blocks

    # Collect the output
    rank = get_rank(comm=self.comm)
    start_time = time.time()
    if rank == self.root:
        log_helper.info(__name__, "COLLECTING RESULTS")
    # Collect the data, blocks, and block_times from all ranks
    collected_data = self.comm.gather(self.result, root=self.root)
    collected_blocks = self.comm.gather(self.blocks, root=self.root)
    # Save the data to self.result, self.block, self.block_times if we are the root
    if rank == self.root:
        # Merge the results from all the processes into a single result and blocks list
        # rather than having a list of lists of results
        self.result = list(itertools.chain.from_iterable(collected_data))
        self.blocks = list(itertools.chain.from_iterable(collected_blocks))

    # Record the time we used to collect the data
    end_time = time.time()
    run_time = end_time - start_time
    if rank == self.root:
        log_helper.info(__name__, "TIME FOR COLLECTING DATA FROM ALL TASKS: " + str(run_time))

    # Return the result
    self.__data_collected = True
    return self.result, self.blocks
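# A minimal mpi4py sketch (separate from the class above) of the gather-then-flatten pattern:
# every rank contributes a list, gather collects a list of lists on the root, and
# itertools.chain.from_iterable merges them. Run with e.g. `mpiexec -n 4 python this_script.py`;
# the result strings are purely illustrative.
import itertools
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
root = 0

# Each rank produces its own partial result list
local_results = ["result_from_rank_%d_item_%d" % (rank, i) for i in range(2)]

# gather returns a list of per-rank lists on the root and None on all other ranks
collected = comm.gather(local_results, root=root)
if rank == root:
    merged = list(itertools.chain.from_iterable(collected))
    print("Merged %d results from %d ranks" % (len(merged), comm.Get_size()))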
def update_job_status(filepath, db_server, jobid, status='complete'):
    """
    Function used to update the status of the job on the server

    :param filepath: Path of the file to be added to the database (only needed to update file permissions)
    :param db_server: The database server url
    :param jobid: The id of the current job.
    :param status: One of 'running', 'complete' or 'error'
    """
    import urllib2
    import urllib

    # If we are at NERSC then set the NERSC Apache permissions
    if 'nersc.gov' in db_server:
        WebHelper.set_apache_acl(filepath)

    # Construct the db add-file url
    update_status_url = os.path.join(db_server, "openmsi/processing/update")
    query_params = {'jobid': jobid, 'status': status}
    update_status_url += "?"
    update_status_url += urllib.urlencode(query_params)

    # Make the url request
    try:
        log_helper.info(__name__, "Updating job status: " + update_status_url)
        url_response = urllib2.urlopen(url=update_status_url)
        if url_response.code == 200:
            return True
    except urllib2.HTTPError as request_error:
        raise ValueError("ERROR: job status could not be updated: \n" +
                         "    Error-code:" + str(request_error.code) + "\n" +
                         "    Error info:" + str(request_error.read()))
    except urllib2.URLError as request_error:
        # Retry without SSL certificate verification if the Python version supports it
        if sys.version_info >= (2, 7, 9):
            import ssl
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            url_response = urllib2.urlopen(url=update_status_url, context=ssl_context)
            if url_response.code == 200:
                return True
            else:
                raise ValueError("ERROR: job status could not be updated: \n" +
                                 "    Error-code:" + str(request_error.code) + "\n" +
                                 "    Error info:" + str(request_error.read()))
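# A tiny sketch of how the status URL above is assembled: urllib.urlencode
# (urllib.parse.urlencode on Python 3) turns the query dict into a properly escaped query
# string. The server URL and job id are placeholders.
try:
    from urllib import urlencode           # Python 2
except ImportError:
    from urllib.parse import urlencode     # Python 3
import os

db_server = "https://example.org/"          # placeholder server
update_status_url = os.path.join(db_server, "openmsi/processing/update")
query_params = {'jobid': 'job-1234', 'status': 'running'}
update_status_url += "?" + urlencode(query_params)
# e.g. https://example.org/openmsi/processing/update?jobid=job-1234&status=running
print(update_status_url)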
def set_apache_acl(filepath):
    """
    Helper function used to set ACL permissions for Apache to make the given file
    accessible to Apache at NERSC. This is necessary to make the file readable for
    adding it to the database.

    :param filepath: String with the path to the file for which ACL permission should be set
    """
    log_helper.info(__name__, "Setting NERSC ACL permissions for Apache")
    # Note u:48 is a replacement for u:apache to ensure that
    # the command works properly on edison.nersc.gov which
    # does not have the apache user. However u:48 is equivalent.
    command = "setfacl -R -m u:48:rwx " + '"' + filepath + '"'
    os.system(command)
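# An alternative sketch (not part of WebHelper) that calls setfacl through subprocess with an
# argument list, which sidesteps the shell quoting that the os.system string above handles by
# wrapping the path in quotes. The setfacl flags mirror the function above; the path is made up.
import subprocess

def set_apache_acl_subprocess(filepath):
    """Grant the Apache user (uid 48) recursive rwx access via setfacl."""
    # No shell is involved, so spaces or quotes in filepath need no escaping
    return_code = subprocess.call(["setfacl", "-R", "-m", "u:48:rwx", filepath])
    if return_code != 0:
        raise RuntimeError("setfacl failed with exit code %d" % return_code)

# Hypothetical usage
# set_apache_acl_subprocess("/project/projectdirs/openmsi/some file.h5")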
def insert(self, index, analysis_object):
    """
    Insert a given analysis object at the given location

    :param index: Location where the object should be inserted
    :param analysis_object: The analysis object to be inserted
    """
    from omsi.analysis.base import analysis_base
    if isinstance(analysis_object, analysis_base):
        if analysis_object in self:
            log_helper.debug(__name__, "Analysis already in the list of tasks")
            return
        log_helper.info(__name__, "Inserting analysis object in the workflow list. " + str(analysis_object))
        super(analysis_task_list, self).insert(index, analysis_object)
    else:
        raise ValueError('Analysis is not of type omsi.analysis.base.analysis_base')
def print_settings(self):
    """
    Print the analysis settings.
    """
    log_helper.info(__name__, "Inputs:")
    for key, value in self.analysis_arguments.iteritems():
        log_helper.info(__name__, "   " + unicode(key) + " = " + unicode(value))
    if self.output_target is not None:
        if isinstance(self.output_target, omsi_file_common):
            h5py_object = omsi_file_common.get_h5py_object(self.output_target)
            log_helper.info(__name__, "Save to: " + unicode(h5py_object.file.filename) +
                            u":" + unicode(h5py_object.name))
        else:
            log_helper.info(__name__, "Save to: " + unicode(self.output_target))
def __init__(self, basename, requires_slicing=True, resolution=15): """ Open an imzml file for data reading. :param basename: The name of the mzml file. If basename is a directory, then the first mzML file found in the directory will be used instead. :type basename: string :param requires_slicing: Should the complete data be read into memory (this makes slicing easier). (default is True) :type requires_slicing: bool :param resolution: For processed data only, the minimum m/z spacing to use for creating the "full" reprofiled data cube :type resolution: float """ # Determine the correct base if os.path.isdir(basename): filelist = self.get_files_from_dir(basename) if len(filelist) > 0: basename = filelist[0] else: raise ValueError( "No valid imzML file found in the given directory.") # Call super constructor. This sets self.basename and self.readall super(imzml_file, self).__init__(basename=basename, requires_slicing=requires_slicing) self.resolution = resolution # Compute the mz axis, pixel coordinates data type etc. self.coordinates, self.mz, self.data_type, self.imzml_type, self.dataset_metadata, self.instrument_metadata, \ self.method_metadata = self.__compute_file_info(filename=self.basename, resolution=self.resolution) self.num_scans = self.coordinates.size log_helper.info(__name__, 'Read %s scans from imzML file.' % self.num_scans) # Compute step size self.x_pos = np.unique(self.coordinates[:, 0]) self.y_pos = np.unique(self.coordinates[:, 1]) self.x_pos_min = self.x_pos.min() self.y_pos_min = self.y_pos.min() # self.step_size = min([min(np.diff(self.x_pos)), min(np.diff(self.y_pos))]) # Determine the shape of the dataset ## TODO: after solving imzML generation prob, fix this for multicube data num_x = self.x_pos.max() - self.x_pos.min() + 1 num_y = self.y_pos.max() - self.y_pos.min() + 1 self.shape = (num_x, num_y, self.mz.size) # Read the data into memory self.data = None if requires_slicing: self.__read_all(filename=basename) log_helper.info(__name__, "IMZML file type: " + str(self.imzml_type)) log_helper.info(__name__, "IMZML data type: " + str(self.data_type))
def print_settings(self):
    """
    Print the analysis settings.
    """
    log_helper.info(__name__, "Inputs:")
    for key, value in sorted(self.analysis_arguments.iteritems()):
        log_helper.info(__name__, "   " + unicode(key) + " = " + unicode(value))
    if self.output_target is not None:
        if isinstance(self.output_target, omsi_file_common):
            h5py_object = omsi_file_common.get_h5py_object(self.output_target)
            log_helper.info(__name__, "Save to: " + unicode(h5py_object.file.filename) +
                            u":" + unicode(h5py_object.name))
        else:
            log_helper.info(__name__, "Save to: " + unicode(self.output_target))
def main(self):
    """Execute the analysis workflow"""
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty")
        return

    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow")
    log_helper.info(__name__, "Adding all dependencies")
    self.add_analysis_dependencies()

    # Record the runtime information
    log_helper.debug(__name__, "Recording runtime information")
    self.run_info.clear()
    self.run_info.record_preexecute()

    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow")
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                analysis.execute()
        # Check whether there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1
        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.")
            break
        if num_tasks > 0 and num_tasks_ready == 0:
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.")
            break  # Nothing can make progress anymore, so stop rather than loop forever
        iterations += 1
        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

    # Record the runtime information after we are done with the workflow
    self.run_info.record_postexecute()
    self.run_info.gather()
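# A stripped-down sketch of the greedy scheduling loop above, using plain dictionaries instead
# of analysis objects: every pass runs whatever has all of its dependencies finished, and the
# loop stops when nothing is left or nothing can make progress. The task graph is hypothetical.
tasks = {'load': [], 'normalize': ['load'], 'peak_pick': ['normalize'], 'report': ['peak_pick']}
completed = set()

while True:
    ready = [name for name, deps in tasks.items()
             if name not in completed and all(d in completed for d in deps)]
    if not ready:
        remaining = set(tasks) - completed
        if remaining:
            print("Blocked tasks with unresolved dependencies: %s" % sorted(remaining))
        break
    for name in ready:
        print("Executing " + name)   # stand-in for analysis.execute()
        completed.add(name)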
def main(self):
    """
    Execute the analysis workflow
    """
    # Do the optional MPI barrier
    if self['synchronize']:
        mpi_helper.barrier(comm=self.mpi_comm)

    # Check if we have anything to do at all
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
        return

    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
    log_helper.info(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
    self.add_analysis_dependencies()

    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
    all_analyses = self.get_analyses()
    iterations = 0
    while True:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                 root=self.mpi_root, comm=self.mpi_comm)
                analysis.execute()
                if self['reduce_memory_usage']:
                    analysis.clear_and_restore()
        # Check whether there are any other tasks that we need to execute now
        num_tasks = 0
        num_tasks_ready = 0
        for analysis in all_analyses:
            if analysis.update_analysis:
                num_tasks += 1
                if len(analysis.check_ready_to_execute()) == 0:
                    num_tasks_ready += 1
        if num_tasks == 0:
            log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
            break
        if num_tasks > 0 and num_tasks_ready == 0:
            log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                               " remain in the queue but cannot be completed due to unresolved dependencies.",
                               root=self.mpi_root, comm=self.mpi_comm)
            break  # Nothing can make progress anymore, so stop rather than loop forever
        iterations += 1
        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
def register_file_with_db(filepath, db_server, file_user_name, jobid=None, check_add_nersc=True):
    """
    Function used to register a given file with the database

    :param filepath: Path of the file to be added to the database
    :param db_server: The database server url
    :param file_user_name: The user to be used, or None if the user should be determined
        based on the file URL.
    :param jobid: Optional input parameter defining the jobid to be updated. If the jobid is given
        then the job will be updated with the database instead of adding the file explicitly.
        I.e., instead of register_file_with_db the update_job_status call is executed.
    :param check_add_nersc: Boolean if set to True performs additional actions to add the file
        to the public OpenMSI gateway hosted at NERSC.

    :returns: Boolean indicating whether the operation was successful
    """
    import urllib2
    import urllib

    if jobid is not None:
        return WebHelper.update_job_status(filepath=filepath,
                                           db_server=db_server,
                                           jobid=jobid,
                                           status='complete')

    # Check if the file is in one of the allowed NERSC locations
    if db_server == WebHelper.default_db_server_url and check_add_nersc:
        is_allowed_path = False
        for allowed_nersc_location in WebHelper.allowed_nersc_locations:
            if filepath.startswith(allowed_nersc_location):
                is_allowed_path = True
                break
        if not is_allowed_path and file_user_name in WebHelper.super_users:
            print "WARNING: Attempt to add a file to openmsi.nersc.gov that is not in a default location."
            print "Do you want to add the file? (Y/N):"
            num_trys = 3
            timeout = 5 * 60  # Timeout after 5 minutes
            for i in range(num_trys):
                # user_input = raw_input()
                user_input = UserInput.userinput_with_timeout(timeout=timeout, default=None)
                if user_input is None:
                    warnings.warn("WARNING: Attempt to add a file to openmsi.nersc.gov that" +
                                  " is not in a default location. Timeout occurred before" +
                                  " user confirmed. Aborted adding the file to the DB.")
                    return False
                if user_input == "Y" or user_input == "y" or user_input == "Yes" or \
                        user_input == "yes" or user_input == "YES":
                    break
                elif user_input == "N" or user_input == "n" or user_input == "No" or \
                        user_input == "no" or user_input == "NO":
                    return False
                else:
                    if i == (num_trys - 1):
                        warnings.warn("WARNING: Attempt to add a file to openmsi.nersc.gov that" +
                                      " is not in a default location. User input unrecognized." +
                                      " Aborted adding the file to the DB.")
                        return False
                    print "Unrecognized response. Do you want to add the file? (Y/N): "
        elif not is_allowed_path:
            warnings.warn("Adding file to the OpenMSI database in unconventional location not permitted for user.")
            return False
        else:
            pass  # Adding the file to the db is allowed

    # If we are at NERSC then set the NERSC Apache permissions
    if 'nersc.gov' in db_server and check_add_nersc:
        WebHelper.set_apache_acl(filepath)

    # Determine the user
    curr_user = file_user_name
    if not curr_user:
        curr_user = os.path.dirname(filepath).split("/")[-1]
    if not curr_user:
        raise ValueError("ERROR: File could not be added to DB. Owner could not be determined.")

    # Construct the db add-file url
    add_file_url = os.path.join(db_server, "openmsi/resources/addfile")
    addfilepath = filepath
    # Correct the filepath if we are on openmsi.nersc.gov, as /global is not mounted but only /project.
    if db_server == WebHelper.default_db_server_url and addfilepath.startswith("/global/project/projectdirs"):
        # NOTE: str.lstrip strips characters, not a prefix, so slice off the leading "/global" instead
        addfilepath = filepath[len("/global"):]
    query_params = {'file': os.path.abspath(addfilepath), 'owner': curr_user}
    add_file_url += "?"
    add_file_url += urllib.urlencode(query_params)
    # add_file_url = add_file_url + "?file=" + \
    #                os.path.abspath(filepath) + "&user=" + ...

    # Make the url request
    try:
        log_helper.info(__name__, "Registering file with DB: " + add_file_url)
        url_response = urllib2.urlopen(url=add_file_url)
        if url_response.code == 200:
            return True
    except urllib2.HTTPError as request_error:
        raise ValueError("ERROR: File could not be added to DB: \n" +
                         "    Error-code:" + str(request_error.code) + "\n" +
                         "    Error info:" + str(request_error.read()))
    return False
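# A quick illustration of the path-prefix pitfall noted above: str.lstrip removes any leading
# characters from the given set, so it is not a prefix remover; slicing off the prefix length is
# the safe way to map /global/project paths to /project. The file path is made up.
path = "/global/project/projectdirs/openmsi/demo.h5"

stripped = path.lstrip("/global")   # strips the characters /, g, l, o, b, a
sliced = path[len("/global"):]      # removes exactly the "/global" prefix

print(stripped)  # project/projectdirs/openmsi/demo.h5  (leading slash lost)
print(sliced)    # /project/projectdirs/openmsi/demo.h5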
def __run_static_1D(self): """ Run the task function using a static task decomposition schema. The data is divided into sub-blocks along the largest split_axis :return: Tuple with the following elements: 1) List with the results from the local execution of the task_function. Each entry is the result from one return of the task_function. In the case of static execution, this is always a list of length 1. 2) List of block_indexes. Each block_index is a tuple with the selection used to divide the data into sub-blocks. In the case of static decomposition we have a range slice object along the axes used for decomposition. """ try: from omsi.shared.log import log_helper except ImportError: from pactolus.third_party.log import log_helper start_time = time.time() # Get MPI parameters rank = get_rank(comm=self.comm) size = get_size(comm=self.comm) # Get data shape parameters and compute the data blocks # Determine the longest axis along which we can split the data axes_shapes = np.asarray(self.main_data.shape)[self.split_axes] total_num_subblocks = np.prod(axes_shapes) if total_num_subblocks < size: size = total_num_subblocks if rank == self.root: log_helper.info(__name__, "Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle") axes_sort_index = np.argsort(axes_shapes)[::-1] split_axis = self.split_axes[axes_sort_index[0]] split_axis_size = axes_shapes[split_axis] if split_axis_size < size: raise NotImplementedError("STATIC scheduling currently parallelizes only over one axis, " + "and the largest axis is too small to fill all MPI tasks") # Determine the size of 1D block block_size = int(split_axis_size / float(size) + 0.5) if block_size * size > split_axis_size and block_size > 1: block_size -= 1 # Compute a block for every rank self.blocks = [slice(None)] * len(self.main_data.shape) start_index = rank * block_size stop_index = start_index + block_size if rank == (size-1): if stop_index != split_axis_size: stop_index = split_axis_size self.blocks[axes_sort_index[0]] = slice(start_index, stop_index) self.blocks = tuple(self.blocks) log_helper.info(__name__, "Rank: " + str(rank) + " Block: " + str(self.blocks)) # Execute the task_function on the given data block task_params = self.task_function_params task_params[self.main_data_param_name] = self.main_data[self.blocks] self.result = self.task_function(**task_params) end_time = time.time() run_time = end_time - start_time self.block_times = [run_time, ] log_helper.info(__name__, "TIME FOR PROCESSING THE DATA BLOCK: " + str(run_time)) # Return the output self.result = [self.result, ] self.blocks = [self.blocks, ] return self.result, self.blocks
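# A compact sketch of the static 1D decomposition idea above, using np.array_split instead of
# hand-computed block sizes: the largest split axis is divided into one contiguous slab per rank.
# The rank count and datacube shape are illustrative.
import numpy as np

size = 4                      # hypothetical number of MPI ranks
data_shape = (100, 60, 901)   # hypothetical (x, y, m/z) datacube shape
split_axes = [0, 1]

# Pick the largest axis among the candidate split axes
axis = max(split_axes, key=lambda a: data_shape[a])
index_blocks = np.array_split(np.arange(data_shape[axis]), size)

for rank, idx in enumerate(index_blocks):
    block = [slice(None)] * len(data_shape)
    block[axis] = slice(int(idx[0]), int(idx[-1]) + 1)
    print("rank %d -> %s" % (rank, tuple(block)))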
def __run_dynamic(self): """ Run the task function using dynamic task scheduling. The root rank divides the data into sub-tasks and sends the tasks to available MPI processes on request. :return: Tuple with the following elements: 1) List with the results from the local execution of the task_function. Each entry is the result from one return of the task_function. 2) List of block_indexes. Each block_index is a tuple with the selection used to divide the data into sub-blocks. In the case of static decomposition we have a range slice object along the axes used for decomposition. """ try: from omsi.shared.log import log_helper except ImportError: from pactolus.third_party.log import log_helper import time rank = get_rank(comm=self.comm) size = get_size(comm=self.comm) if size < 2: warnings.warn('DYNAMIC task scheduling requires at least 2 MPI ranks. Using STATIC scheduling instead.') return self.__run_static_1D() # We are the controlling rank if rank == self.root: self.result = [] self.blocks = [] self.block_times = [] # Get data shape parameters and compute the data blocks axes_shapes = np.asarray(self.main_data.shape)[self.split_axes] total_num_subblocks = np.prod(axes_shapes) if total_num_subblocks < size: if rank == self.root: warnings.warn("Insufficient number of blocks for number of MPI ranks. Some ranks will remain idle") # Compute the list of all possible blocks base_blocks = [[slice(None)]] * len(self.main_data.shape) for axis_index in self.split_axes: base_blocks[axis_index] = range(self.main_data.shape[axis_index]) block_tuples = itertools.product(*base_blocks) # Communicate blocks with task ranks log_helper.info(__name__, "PROCESSING DATA BLOCKS") start_time = time.time() block_index = 0 for block_selection in block_tuples: request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG']) self.comm.send((block_index, block_selection), dest=request_rank, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG']) block_index += 1 if (block_index % 100) == 0: log_helper.debug(__name__, str((block_index, total_num_subblocks, request_rank))) end_time = time.time() run_time = end_time - start_time log_helper.info(__name__, "TIME FOR SCHEDULING ALL TASKS: " + str(run_time)) start_time = time.time() log_helper.info(__name__, "FINALIZING") # Terminate all ranks and receive all data from the different ranks if requested all_ranks_status = np.zeros(size, 'bool') all_ranks_status[self.root] = True while not np.all(all_ranks_status): request_rank = self.comm.recv(source=MPI.ANY_SOURCE, tag=self.MPI_MESSAGE_TAGS['RANK_MSG']) self.comm.send((None, None), dest=request_rank, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG']) all_ranks_status[request_rank] = True end_time = time.time() run_time = end_time - start_time log_helper.info(__name__, "TIME FOR FINALIZING TASKS: " + str(run_time)) # We are a rank that has to run tasks else: # Request a new data block self.result = [] self.blocks = [] self.block_times = [] while True: start_time = time.time() self.comm.send(rank, dest=self.root, tag=self.MPI_MESSAGE_TAGS['RANK_MSG']) block_index, block_selection = self.comm.recv(source=self.root, tag=self.MPI_MESSAGE_TAGS['BLOCK_MSG']) if block_index is None: break # Execute the task_function on the given data block task_params = self.task_function_params task_params[self.main_data_param_name] = self.main_data[block_selection] self.result.append(self.task_function(**task_params)) self.blocks.append(block_selection) # Record the timings end_time = time.time() run_time = end_time - start_time 
self.block_times.append(run_time) # Return the result return self.result, self.blocks
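# A small sketch of how the dynamic scheduler above enumerates sub-blocks: itertools.product
# over per-pixel indices along the split axes, with slice(None) along all other axes, yields one
# selection tuple per task. The shape values are illustrative.
import itertools

data_shape = (3, 2, 901)   # hypothetical (x, y, m/z) datacube
split_axes = [0, 1]

# One full-slice placeholder per axis; split axes get their list of indices
base_blocks = [[slice(None)]] * len(data_shape)
for axis_index in split_axes:
    base_blocks[axis_index] = range(data_shape[axis_index])

block_tuples = list(itertools.product(*base_blocks))
print(len(block_tuples))   # 6 tasks: one per (x, y) pixel
print(block_tuples[0])     # e.g. (0, 0, slice(None, None, None))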
def __compute_file_info(cls, filename, resolution):
    ## TODO completely refactor this to make it smartly handle profile or centroid datasets
    ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    ## TODO: profile datasets should work as is
    ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    """
    Internal helper function used to compute the mz axis, data type for the intensities, format type

    :return: Numpy array with the pixel coordinates
    :return: Numpy array with mz axis
    :return: string with data type
    :return: imzml file type
    :return: dataset_metadata, instrument_metadata, and method_metadata metadata_dict objects
    """
    reader = ImzMLParser(filename)
    # Read the first spectrum
    mz_axes, intens = reader.getspectrum(0)   # NOTE: mz_axes is a tuple
    # Read the coordinates
    coordinates = np.asarray(reader.coordinates)

    # Determine the data type for the intensity values
    dtype = np.asarray(intens).dtype.str

    # Compute the mz axis and file type
    file_type = cls.available_imzml_types['continuous']
    min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
    for ind in range(coordinates.shape[0]):   # for ind, loc in enumerate(reader.coordinates):
        mz, intens = reader.getspectrum(ind)
        if mz == mz_axes:
            pass
        else:
            file_type = cls.available_imzml_types['processed']
            if min_mz > np.amin(mz):
                min_mz = np.amin(mz)
            if max_mz < np.amax(mz):
                max_mz = np.amax(mz)
    # Reinterpolate the mz-axis if we have a processed mode imzml file
    if file_type == cls.available_imzml_types['processed']:
        f = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
        mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), f)
        log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

    # Construct the imzml metadata information
    dataset_metadata = metadata_dict()
    instrument_metadata = metadata_dict()
    method_metadata = metadata_dict()
    for k, v in reader.imzmldict.iteritems():
        dataset_metadata[k] = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=k,
                                             ontology=None)

    # Delete the parser and read the metadata
    del reader

    # Parse the metadata for the file. We try to parse only the header and ignore the
    # <run > group in the XML file to avoid going through the whole file again
    # while extracting the majority of the relevant metadata
    try:
        with open(filename, 'r') as ins:
            metdata_header = ''
            for line in ins:
                if '<run' in line:
                    break
                else:
                    metdata_header += line
            metdata_header += '</mzML>'
            metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
            for k, v in metdata_header_dict.iteritems():
                store_value = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=str(k) + " extracted from imzML XML header.",
                                             ontology=None)
                if k == 'instrumentConfigurationList':
                    instrument_metadata[k] = store_value
                elif k == 'dataProcessingList':
                    method_metadata[k] = store_value
                elif k == 'scanSettingsList':
                    dataset_metadata[k] = store_value
                elif k == 'softwareList':
                    method_metadata[k] = store_value
                elif k == 'sampleList':
                    method_metadata[k] = store_value
                else:
                    dataset_metadata[k] = store_value
            dataset_metadata['imzml_xml_metadata_header'] = metadata_value(
                name='imzml_xml_metadata_header',
                value=metdata_header,
                unit=None,
                description='XML imzML header',
                ontology=None)
    except:
        log_helper.warning(__name__, "Extraction of additional imzML metadata failed")

    return coordinates, np.asarray(mz_axes), dtype, file_type, \
        dataset_metadata, instrument_metadata, method_metadata
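# A short sketch of the resampled m/z axis computed above for processed-mode files: the number
# of points follows from the requested resolution in parts-per-million, and np.logspace places
# them with roughly constant relative (ppm) spacing. The bounds and resolution are illustrative.
import numpy as np

min_mz, max_mz = 100.0, 1000.0
resolution_ppm = 5.0   # desired relative spacing in ppm

# Number of log-spaced samples needed so adjacent points differ by ~resolution_ppm
num_points = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution_ppm))
mz_axis = np.logspace(np.log10(min_mz), np.log10(max_mz), num_points)

# Relative spacing between neighbouring points stays close to 5 ppm across the axis
rel_spacing_ppm = np.diff(mz_axis) / mz_axis[:-1] * 1e6
print(num_points, rel_spacing_ppm.min(), rel_spacing_ppm.max())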
def main(self):
    """
    Execute the analysis workflow
    """
    # Do the optional MPI barrier
    if self['synchronize']:
        mpi_helper.barrier(comm=self.mpi_comm)

    # Check if we have anything to do at all
    if len(self.get_analyses()) == 0:
        log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
        return

    # Add all dependencies to the workflow
    log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
    log_helper.debug(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
    self.add_analysis_dependencies()

    # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
    log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
    all_analyses = self.get_analyses()
    iterations = 0
    continue_running = True
    while continue_running:
        # Run all analyses that are ready
        for analysis in all_analyses:
            if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                 root=self.mpi_root, comm=self.mpi_comm)
                analysis.execute()
                if self['reduce_memory_usage']:
                    analysis.clear_and_restore()
        # Check if there are any other tasks that we need to execute now
        num_tasks_completed, num_tasks_waiting, num_tasks_ready, num_tasks_blocked = \
            all_analyses.task_status_stats()
        if num_tasks_waiting == 0:
            log_helper.info(__name__, "Completed executing the workflow.",
                            root=self.mpi_root, comm=self.mpi_comm)
            continue_running = False
        if num_tasks_waiting > 0 and num_tasks_ready == 0:
            blocking_tasks = all_analyses.get_blocking_tasks()
            log_helper.warning(__name__,
                               "Workflow could not be fully executed. " + str(num_tasks_waiting) +
                               " tasks remain in the queue but cannot be completed due to unresolved dependencies." +
                               " The workflow will be restarted once the outputs of the blocking tasks are ready." +
                               " Blocking tasks are: " + str(blocking_tasks),
                               root=self.mpi_root, comm=self.mpi_comm)
            # Tell all blocking tasks that they should continue the workflow once they are ready.
            # This happens in the omsi.analysis.analysis_base.outputs_ready(...) function
            for block_task in blocking_tasks:
                block_task.continue_workflow_when_ready(self)
            # NOTE: if self['reduce_memory_usage'] is True then prior analyses were cleared, i.e.,
            # they will be re-executed when the workflow is restarted. It is, therefore, not recommended
            # to use the reduce_memory_usage option when performing interactive tasks.
            continue_running = False
        iterations += 1

    # All analyses are done, so we no longer need to continue any analyses when we are done
    if num_tasks_blocked == 0:
        for analysis in all_analyses:
            analysis.continue_analysis_when_ready = False

    log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
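# Illustrative sketch (not part of the module) of the greedy scheduling idea used in main() above:
# repeatedly run every task whose dependencies are already satisfied, and stop when nothing is
# waiting or when only blocked tasks remain. The task objects and their fields are hypothetical.
def run_greedy(tasks):
    """tasks: list of objects with .done (bool), .depends_on (list of tasks), and .execute()."""
    while True:
        ready = [t for t in tasks
                 if not t.done and all(dep.done for dep in t.depends_on)]
        if not ready:
            break                      # either everything finished or the rest is blocked
        for task in ready:
            task.execute()
            task.done = True
    return [t for t in tasks if not t.done]   # tasks blocked by unresolved dependencies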
def main(self):
    """
    Default main function for running an analysis from the command line.
    The default implementation exposes all specified analysis parameters as command
    line options to the user. The default implementation also provides means to
    print a help text for the function.

    :raises: ValueError is raised in case that the workflow executor is missing or not initialized
    """
    # Initialize the argument parser
    if self.parser is None:
        self.initialize_argument_parser()

    try:
        # Parse the command line arguments to determine the command line driver settings
        self.parse_cl_arguments()
    except:
        self.remove_output_target()
        raise

    if self.workflow_executor is None:
        self.remove_output_target()
        log_helper.error(__name__, 'Missing --script parameter or workflow_executor object')
        raise ValueError('Workflow not initialized')

    # Add and parse the command line arguments specific to the analysis to determine the analysis settings
    try:
        self.add_and_parse_workflow_arguments()
    except:
        self.remove_output_target()
        raise

    # Print the analysis settings
    if mpi_helper.get_rank() == self.mpi_root:
        self.print_settings()

    # Enable time and usage profiling
    try:
        # Enable time and usage profiling if requested
        if self.profile_analyses:
            try:
                self.workflow_executor.analysis_tasks.enable_time_and_usage_profiling(self.profile_analyses)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of time and usage not available due to missing packages.")
                log_helper.warning(__name__, e.message)
        # Enable memory profiling if requested
        if self.profile_analyses_mem:
            try:
                self.workflow_executor.analysis_tasks.enable_memory_profiling(self.profile_analyses_mem)
            except ImportError as e:
                log_helper.warning(__name__, "Profiling of memory usage not available due to missing packages")
                log_helper.warning(__name__, e.message)
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Execute the analysis
    try:
        log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                         root=self.mpi_root, comm=self.mpi_comm)
        self.workflow_executor.execute()
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Finalize the saving of results on our mpi root rank. NOTE: When running in serial
    # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
    # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
    if mpi_helper.get_rank() == self.mpi_root:
        # Print usage profiles if available
        try:
            self.print_time_and_usage_profiles()
        except:
            log_helper.error(__name__, "An error occurred while trying to print time and usage profiles",
                             root=self.mpi_root, comm=self.mpi_comm)
        # Print memory profile data if available
        try:
            self.print_memory_profiles()
        except:
            log_helper.error(__name__, "An error occurred while trying to print memory profiles",
                             root=self.mpi_root, comm=self.mpi_comm)

        # Print the time it took to run the analysis
        try:
            # Parallel case: We need to compile/collect timing data from all cores
            if isinstance(self.workflow_executor.run_info['execution_time'], list):
                # Time for each task to execute
                log_helper.info(__name__, "Time in seconds for each analysis process: " +
                                str(self.workflow_executor.run_info['execution_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Start times of each task
                log_helper.info(__name__, "Time when each of the processes started: " +
                                str(self.workflow_executor.run_info['start_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Stop times for each task
                log_helper.info(__name__, "Time when each of the processes finished: " +
                                str(self.workflow_executor.run_info['end_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Compile the time-to-execute string
                exec_time_array = np.asarray(self.workflow_executor.run_info['execution_time'], dtype=float)
                max_exec_time = str(exec_time_array.max())
                min_exec_time = str(exec_time_array.min())
                mean_exec_time = str(exec_time_array.mean())
                exec_time_string = max_exec_time + " s " + \
                    " ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
            # Serial case: We only have a single time to worry about
            else:
                exec_time_string = str(self.workflow_executor.run_info['execution_time']) + " s"
            log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                            root=self.mpi_root, comm=self.mpi_comm)
        except:
            raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            for analysis in self.workflow_executor.analysis_tasks:
                omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                             analysis=analysis)
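# Small sketch (not part of the module) of how the per-rank execution times recorded in
# run_info['execution_time'] can be condensed into the max/min/mean summary string logged above.
# The input list is made-up example data.
import numpy as np

def summarize_execution_times(execution_times):
    times = np.asarray(execution_times, dtype=float)
    return "%s s  ( min = %s , mean = %s )" % (times.max(), times.min(), times.mean())

# summarize_execution_times([1.2, 0.9, 1.5])  ->  '1.5 s  ( min = 0.9 , mean = 1.2 )'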
def main(self):
    """
    Default main function for running an analysis from the command line.
    The default implementation exposes all specified analysis parameters as command
    line options to the user. The default implementation also provides means to
    print a help text for the function.

    :raises: ValueError is raised in case that the analysis class is unknown
    """
    # Get the analysis object if needed
    if self.add_analysis_class_arg:
        try:
            self.get_analysis_class_from_cl()
        except (ImportError, AttributeError, ValueError):
            pass

    # Initialize the argument parser
    if self.parser is None:
        self.initialize_argument_parser()

    # Check if we have a valid analysis class
    if self.analysis_class is None:
        self.parser.print_help()
        raise ValueError('Could not determine the analysis class.')
    if not issubclass(self.analysis_class, analysis_base):
        self.parser.print_help()
        raise ValueError('Analysis class is not a subclass of analysis_base.')

    try:
        # Parse the command line arguments to determine the command line driver settings
        self.parse_cl_arguments()
        # Add and parse the command line arguments specific to the analysis to determine the analysis settings
        self.add_and_parse_analysis_arguments()
    except:
        self.remove_output_target()
        raise

    # Print the analysis settings
    if mpi_helper.get_rank() == self.mpi_root:
        self.print_settings()

    # Call the execute function of the analysis
    try:
        # Create the analysis object
        if self.analysis_object is None:
            self.create_analysis_object()
        # Execute the analysis
        log_helper.debug(__name__, 'Analysis arguments: ' + str(self.analysis_arguments),
                         root=self.mpi_root, comm=self.mpi_comm)
        self.analysis_object.execute(**self.analysis_arguments)
    except:
        if mpi_helper.get_rank() == self.mpi_root:
            self.remove_output_target()
        raise

    # Finalize the saving of results on our mpi root rank. NOTE: When running in serial
    # the condition of mpi_helper.get_rank() == self.mpi_root evaluates to True because
    # our mpi_root is 0 and the mpi_helper returns 0 for the rank when running in serial.
    if mpi_helper.get_rank() == self.mpi_root:
        # Print the profiling results of time and usage
        if self.analysis_object['profile_time_and_usage']:
            print ""
            print "PROFILING DATA: TIME AND USAGE"
            print ""
            self.analysis_object.get_profile_stats_object(consolidate=True).print_stats()

        # Print the profiling results for memory usage
        if self.analysis_object['profile_memory']:
            print ""
            print "PROFILING DATA: MEMORY"
            print ""
            print self.analysis_object.get_memory_profile_info()

        # Print the time it took to run the analysis
        try:
            # Parallel case: We need to compile/collect timing data from all cores
            if isinstance(self.analysis_object.run_info['execution_time'], list):
                # Time for each task to execute
                log_helper.info(__name__, "Time in seconds for each analysis process: " +
                                str(self.analysis_object.run_info['execution_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Start times of each task
                log_helper.info(__name__, "Time when each of the processes started: " +
                                str(self.analysis_object.run_info['start_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Stop times for each task
                log_helper.info(__name__, "Time when each of the processes finished: " +
                                str(self.analysis_object.run_info['end_time']),
                                root=self.mpi_root, comm=self.mpi_comm)
                # Compile the time-to-execute string
                exec_time_array = np.asarray(self.analysis_object.run_info['execution_time'], dtype=float)
                max_exec_time = str(exec_time_array.max())
                min_exec_time = str(exec_time_array.min())
                mean_exec_time = str(exec_time_array.mean())
                exec_time_string = max_exec_time + " s " + \
                    " ( min = " + min_exec_time + " , mean = " + mean_exec_time + " )"
            # Serial case: We only have a single time to worry about
            else:
                exec_time_string = str(self.analysis_object.run_info['execution_time']) + " s"
            log_helper.info(__name__, "Time to execute analysis: " + exec_time_string,
                            root=self.mpi_root, comm=self.mpi_comm)
        except:
            raise

        # Save the analysis to file
        if self.output_target is not None:
            from omsi.dataformat.omsi_file.analysis import omsi_analysis_manager
            omsi_analysis_manager.create_analysis_static(analysis_parent=self.output_target,
                                                         analysis=self.analysis_object)
def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
    """
    Execute the local peak finder for the given msidata.

    :param spectrum_indexes: List of integer indices of the subset of spectra that should be
        processed by this MPI task. If spectrum_indexes is set, then the given subblock will be
        processed in SERIAL instead of processing self['fpl_data'] in PARALLEL (if available).
        This parameter is strictly optional and intended for internal use only to facilitate
        the efficient parallel implementation.
    :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

    :returns: A series of numpy arrays with the score data for each pixel and a 2D array of pixel
        indices describing for each spectrum the (x,y) pixel location in the image.
        ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']

        * 'pixel_index' , int, 2D array of pixel indices describing for each spectrum \
          the (x,y) pixel location in the image
        * 'score', float, MIDAS score of row
        * 'id', str, database ID e.g. 'MetaCyC_7884'
        * 'name', str, database name, e.g. 'glycine'
        * 'mass', float, mass in Da of IDed compound
        * 'n_peaks', int, number of peaks in data
        * 'n_match', int, number of peaks in data matched
    """
    log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
    # Get the data we need to process
    fpl_data = self['fpl_data']
    fpl_peak_mz = fpl_data['peak_mz']
    fpl_peak_value = fpl_data['peak_value']
    fpl_peak_arrayindex = fpl_data['peak_arrayindex']
    # Calculate the parent_mass
    precursor_mz = self['precursor_mz']
    if precursor_mz == -1:
        precursor_mz = self['fpl_data']['precursor_mz'][:]
    # Assign parameter settings to local variables for convenience
    metabolite_database = self['metabolite_database']
    ms1_mass_tol = self['ms1_mass_tolerance']
    ms2_mass_tol = self['ms2_mass_tolerance']
    neutralizations = self['neutralizations']
    max_depth = self['max_depth']

    # Make the numpy array with the list of tree files and their MS1 masses
    if file_lookup_table is None:
        # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
        log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
        if os.path.isfile(self['trees']):
            if self['trees'].endswith('.npy'):
                file_lookup_table = np.load(self['trees'])
            else:
                in_treefile = open(self['trees'], 'r')
                tree_files = [line.rstrip('\n') for line in in_treefile]
                in_treefile.close()
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
        elif os.path.isdir(self['trees']):
            file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=self['trees'])

    # Define the common pactolus parameters
    pactolus_parameters = {'file_lookup_table': file_lookup_table,
                           'ms1_mass_tol': ms1_mass_tol,
                           'ms2_mass_tol': ms2_mass_tol,
                           'neutralizations': neutralizations,
                           'max_depth': max_depth}

    # Get the peak_arrayindex with [[x, y, array_offset], ...] values describing the
    # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
    # where we can find the spectrum that we need to process
    num_spectra = fpl_peak_arrayindex.shape[0]
    if spectrum_indexes is None:
        # Get the complete peak array index data
        spectrum_indexes = np.arange(0, num_spectra)
        enable_parallel = True
    else:
        if isinstance(spectrum_indexes, int):
            spectrum_indexes = np.asarray([spectrum_indexes, ])
        enable_parallel = False

    #############################################################
    # Parallel execution using MPI
    #############################################################
    # We have more than a single core AND we have multiple spectra to process
    if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
        # We were not asked to process a specific data subblock from a parallel process
        # but we need to initiate the parallel processing.
        if enable_parallel:
            log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
            # Setup the parallel processing using mpi_helper.parallel_over_axes
            split_axis = [0, ]
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.execute_analysis,                            # Execute this function
                task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                main_data=spectrum_indexes,                                     # Process the spectra independently
                split_axes=split_axis,                                          # Split along axes
                main_data_param_name='spectrum_indexes',                        # data input param
                root=self.mpi_root,                                             # The root MPI task
                schedule=self['schedule'],                                      # Parallel scheduling scheme
                comm=self.mpi_comm)                                             # MPI communicator
            # Execute the analysis in parallel
            result = scheduler.run()
            # Collect the output data to the root rank if requested
            if self['collect']:
                result = scheduler.collect_data()

            # Compile the data from the parallel execution
            pixel_index = np.zeros((0, 2), dtype='int')
            score = np.zeros((0,), dtype='f4')
            id_data = np.zeros((0,), dtype='a100')
            name = np.zeros((0,), dtype='a100')
            mass = np.zeros((0,), dtype='f4')
            n_peaks = np.zeros((0,), dtype='i4')
            n_match = np.zeros((0,), dtype='i4')
            use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

            # TODO NEED to update since collect now returns a single list not a list of lists
            if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                # We did not process any data on the root process when using dynamic scheduling
                # and we did not collect the data to the root either
                pass
            # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
            #     temp_data = [ri[0] for rt in result[0] for ri in rt]
            #     if len(temp_data) > 0:
            #         hit_table = np.concatenate(tuple(temp_data), axis=-1)
            #     temp_data = [ri[1] for rt in result[0] for ri in rt]
            #     if len(temp_data) > 0:
            #         pixel_index = np.concatenate(tuple(temp_data), axis=0)  # axis=-1
            else:
                log_helper.debug(__name__, 'Compiling output')
                # Compile pixel_index
                temp_data = [ri[0] for ri in result[0]]
                if len(temp_data) > 0:
                    pixel_index = np.concatenate(tuple(temp_data), axis=0)
                # Compile scores
                temp_data = [ri[1] for ri in result[0]]
                if len(temp_data) > 0:
                    score = np.concatenate(tuple(temp_data), axis=0)
                # Compile id
                temp_data = [ri[2] for ri in result[0]]
                if len(temp_data) > 0:
                    id_data = np.concatenate(tuple(temp_data), axis=0)
                # Compile name
                temp_data = [ri[3] for ri in result[0]]
                if len(temp_data) > 0:
                    name = np.concatenate(tuple(temp_data), axis=0)
                # Compile mass
                temp_data = [ri[4] for ri in result[0]]
                if len(temp_data) > 0:
                    mass = np.concatenate(tuple(temp_data), axis=0)
                # Compile n_peaks
                temp_data = [ri[5] for ri in result[0]]
                if len(temp_data) > 0:
                    n_peaks = np.concatenate(tuple(temp_data), axis=0)
                # Compile n_match
                temp_data = [ri[6] for ri in result[0]]
                if len(temp_data) > 0:
                    n_match = np.concatenate(tuple(temp_data), axis=0)
                log_helper.log_var(__name__, score=score)

            # Return the compiled output
            return pixel_index, score, id_data, name, mass, n_peaks, n_match

    #############################################################
    # Serial processing of the current data block
    #############################################################
    log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
    # Initialize the output data structures
    # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
    # if len(pixel_index.shape) == 1:
    #     pixel_index = pixel_index[np.newaxis, :]
    hit_matrix = []

    # Iterate through all the pixels we were asked to process in serial
    for current_index, spectrum_index in enumerate(spectrum_indexes):
        # Determine the start and stop index for the m/z and intensity data of the current spectrum
        start = int(fpl_peak_arrayindex[spectrum_index, 2])
        stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                   if spectrum_index < (num_spectra - 1)
                   else fpl_peak_value.size)
        spectrum_length = stop - start
        # Skip empty spectra
        if spectrum_length == 0:
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
            log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            continue
        # Load the m/z and intensity values for the current spectrum
        current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
        current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
        current_peaks_list[:, 1] = fpl_peak_value[start:stop]
        # Get the parent mass
        current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]
        start_time = time.time()
        # Call MIDAS to score the current spectrum against all compounds in the database
        current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                    ms1_mz=[current_parent_mass, ],
                                                                    params=pactolus_parameters)
        end_time = time.time()
        execution_time = end_time - start_time
        time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                   str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
        time_str += " : num hits : " + str((current_hits > 0).sum())
        # log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
        # sys.stdout.flush()
        print time_str
        sys.stdout.flush()
        # Save the hits for the current pixel
        hit_matrix.append(current_hits[0, :])

    # Index the results based on the given metabolite database
    score = []
    id_data = []
    name = []
    mass = []
    n_peaks = []
    n_match = []
    pixel_index = []
    if len(metabolite_database) > 0:  # We don't have an empty string
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # np.where returns a tuple of index arrays; take the first element to get the indices
            non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
            if non_zero_scores.size > 0:
                current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                    pactolus_results=hit_matrix[current_index],
                    table_file=file_lookup_table,
                    original_db=metabolite_database))
                for score_index in non_zero_scores:
                    pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                    score.append(current_hit_table['score'][score_index])
                    id_data.append(current_hit_table['id'][score_index])
                    name.append(current_hit_table['name'][score_index])
                    mass.append(current_hit_table['mass'][score_index])
                    n_peaks.append(current_hit_table['n_peaks'][score_index])
                    n_match.append(current_hit_table['n_match'][score_index])
    else:
        pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        score = np.asarray(hit_matrix)

    # Return the hit_table and the index of the pixel each hit_table applies to
    print "rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score)
    sys.stdout.flush()
    return np.asarray(pixel_index), \
        np.asarray(score), \
        np.asarray(id_data), \
        np.asarray(name), \
        np.asarray(mass), \
        np.asarray(n_peaks), \
        np.asarray(n_match)
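# Sketch (not part of the module) of how the flat peak arrays are indexed via peak_arrayindex:
# each row [x, y, offset] gives the pixel location and the start offset of that pixel's peaks in
# the flat peak_mz / peak_value arrays; the stop offset is the next row's offset (or the array end).
# The arrays below are small made-up examples.
import numpy as np

peak_mz = np.array([100.1, 200.2, 150.3, 300.4, 110.5])
peak_value = np.array([10.0, 20.0, 5.0, 7.0, 3.0])
peak_arrayindex = np.array([[0, 0, 0],     # pixel (0, 0) starts at offset 0
                            [0, 1, 2],     # pixel (0, 1) starts at offset 2
                            [1, 0, 4]])    # pixel (1, 0) starts at offset 4

def get_spectrum(i):
    start = int(peak_arrayindex[i, 2])
    stop = int(peak_arrayindex[i + 1, 2]) if i < peak_arrayindex.shape[0] - 1 else peak_mz.size
    return peak_mz[start:stop], peak_value[start:stop]

# get_spectrum(1) -> (array([150.3, 300.4]), array([5., 7.]))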
def __compute_file_info(cls, filename, resolution):
    # TODO completely refactor this to make it smartly handle profile or centroid datasets
    # TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    # TODO: profile datasets should work as is
    # TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    """
    Internal helper function used to compute the mz axis, the data type for the intensities,
    and the imzml file type.

    :return: Numpy array of spectrum coordinates
    :return: Numpy array with the mz axis
    :return: string with the data type of the intensities
    :return: imzml file type
    :return: dataset metadata
    :return: instrument metadata
    :return: method metadata
    """
    reader = ImzMLParser(filename)
    # Read the first spectrum
    mz_axes, intens = reader.getspectrum(0)   # NOTE: mz_axes is a tuple
    # Read the coordinates
    coordinates = np.asarray(reader.coordinates)

    # # Start the data at [0,0,0]
    # coordinates[:, 0] = coordinates[:, 0] - np.amin(coordinates, axis=0)[0]
    # coordinates[:, 1] = coordinates[:, 1] - np.amin(coordinates, axis=0)[1]
    # coordinates[:, 2] = coordinates[:, 2] - np.amin(coordinates, axis=0)[2]

    # Determine the data type for the intensity values
    dtype = np.asarray(intens).dtype.str

    # Compute the mz axis and file type
    file_type = cls.available_imzml_types['continuous']
    min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
    for ind in range(coordinates.shape[0]):   # for ind, loc in enumerate(reader.coordinates):
        mz, intens = reader.getspectrum(ind)
        if mz == mz_axes:
            pass
        else:
            file_type = cls.available_imzml_types['processed']
            if min_mz > np.amin(mz):
                min_mz = np.amin(mz)
            if max_mz < np.amax(mz):
                max_mz = np.amax(mz)
    # Reinterpolate the mz-axis if we have a processed mode imzml file
    if file_type == cls.available_imzml_types['processed']:
        f = np.ceil(1e6 * np.log(max_mz / min_mz) / resolution)
        mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), int(f))   # number of bins must be an integer
        log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

    # Construct the imzml metadata information
    dataset_metadata = metadata_dict()
    instrument_metadata = metadata_dict()
    method_metadata = metadata_dict()
    for k, v in reader.imzmldict.iteritems():
        dataset_metadata[k] = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=k,
                                             ontology=None)

    # Delete the parser and read the metadata
    del reader

    # Parse the metadata for the file. We try to parse only the header and ignore the
    # <run> group in the XML file to avoid going through the whole file again
    # while extracting the majority of the relevant metadata
    try:
        with open(filename, 'r') as ins:
            metdata_header = ''
            for line in ins:
                if '<run' in line:
                    break
                else:
                    metdata_header += line
            metdata_header += '</mzML>'
            metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
            for k, v in metdata_header_dict.iteritems():
                store_value = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=str(k) + " extracted from imzML XML header.",
                                             ontology=None)
                if k == 'instrumentConfigurationList':
                    instrument_metadata[k] = store_value
                elif k == 'dataProcessingList':
                    method_metadata[k] = store_value
                elif k == 'scanSettingsList':
                    dataset_metadata[k] = store_value
                elif k == 'softwareList':
                    method_metadata[k] = store_value
                elif k == 'sampleList':
                    method_metadata[k] = store_value
                else:
                    dataset_metadata[k] = store_value
            dataset_metadata['imzml_xml_metadata_header'] = metadata_value(
                name='imzml_xml_metadata_header',
                value=metdata_header,
                unit=None,
                description='XML imzML header',
                ontology=None)
    except:
        log_helper.warning(__name__, "Extraction of additional imzML metadata failed")

    return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
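# Sketch (not part of the module) of the m/z axis reinterpolation used for processed-mode imzML
# files above: for a constant relative resolution given in ppm, the number of log-spaced bins
# between min_mz and max_mz is ceil(1e6 * ln(max_mz / min_mz) / resolution). Values are example inputs.
import numpy as np

def resampled_mz_axis(min_mz, max_mz, resolution_ppm):
    num_bins = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution_ppm))
    return np.logspace(np.log10(min_mz), np.log10(max_mz), num_bins)

# resampled_mz_axis(100.0, 1000.0, 5.0).shape -> (460518,)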