Example #1
    def __init__(self,
                 analysis_objects=None):
        """
        Initialize the workflow executor

        :param analysis_objects: A list of analysis objects to be executed
        """
        super(workflow_executor_base, self).__init__()
        log_helper.debug(__name__, "Creating workflow executor")
        if analysis_objects is not None:
            if not isinstance(analysis_objects, list) and not isinstance(analysis_objects, set):
                analysis_objects = [analysis_objects, ]
        log_helper.log_var(__name__, analysis_objects=analysis_objects, level='DEBUG')
        self.run_info = run_info_dict()
        self.analysis_tasks = analysis_task_list(analysis_objects) \
            if analysis_objects is not None \
            else analysis_task_list()
        self.mpi_comm = mpi_helper.get_comm_world()
        self.mpi_root = 0
        self.workflow_identifier = "we"
        # self.parameters = []  # Inherited from parameter_manager and set in parent class

        dtypes = data_dtypes.get_dtypes()
        self.add_parameter(name='profile_time_and_usage',
                           help='Enable/disable profiling of time and usage of the whole workflow',
                           required=False,
                           default=False,
                           dtype=dtypes['bool'])
        self.add_parameter(name='profile_memory',
                           help='Enable/disable profiling of memory usage of the whole workflow',
                           required=False,
                           default=False,
                           dtype=dtypes['bool'])

    def main(self):
        """Execute the analysis workflow"""
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty")
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow")
        log_helper.info(__name__, "Adding all dependencies")
        self.add_analysis_dependencies()

        # Record the runtime information
        log_helper.debug(__name__, "Recording runtime information")
        self.run_info.clear()
        self.run_info.record_preexecute()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow")
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis))
                    analysis.execute()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.")
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.")
                break
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG')

        # Record the runtime information after we are done with the workflow
        self.run_info.record_postexecute()
        self.run_info.gather()
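
# --- Illustration (not part of the scraped example above) ---
# A minimal, self-contained sketch of the greedy scheduling loop that main()
# implements: repeatedly execute every task whose dependencies are met and stop
# once nothing is pending or nothing can make progress. The Task class below is
# a hypothetical stand-in for the analysis objects; only the loop structure
# mirrors the example.

class Task(object):
    def __init__(self, name, depends_on=None):
        self.name = name
        self.depends_on = depends_on or []  # tasks that must complete first
        self.done = False

    def ready(self):
        return all(dep.done for dep in self.depends_on)

    def execute(self):
        print("executing " + self.name)
        self.done = True

def run_greedy(tasks):
    while True:
        pending = [t for t in tasks if not t.done]
        if not pending:
            print("Completed executing the workflow.")
            break
        ready = [t for t in pending if t.ready()]
        if not ready:
            print(str(len(pending)) + " tasks remain but are blocked by unresolved dependencies.")
            break
        for task in ready:
            task.execute()

a = Task('a')
b = Task('b', depends_on=[a])
c = Task('c', depends_on=[a, b])
run_greedy([a, b, c])  # executes a, then b, then c, then reports completion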
Example #3
    def main(self):
        """
        Execute the analysis workflow
        """
        # Do the optional MPI barrier
        if self['synchronize']:
            mpi_helper.barrier(comm=self.mpi_comm)

        # Check if we have anything to do at all
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        log_helper.info(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
        self.add_analysis_dependencies()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
        all_analyses = self.get_analyses()
        iterations = 0
        while True:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                     root=self.mpi_root, comm=self.mpi_comm)
                    analysis.execute()
                    if self['reduce_memory_usage']:
                        analysis.clear_and_restore()
            # Check if there are any other tasks that we need to execute now
            num_tasks = 0
            num_tasks_ready = 0
            for analysis in all_analyses:
                if analysis.update_analysis:
                    num_tasks += 1
                    if len(analysis.check_ready_to_execute()) == 0:
                        num_tasks_ready += 1
            if num_tasks == 0:
                log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
                break
            if num_tasks > 0 and num_tasks_ready == 0:
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies.",
                                   root=self.mpi_root, comm=self.mpi_comm)
                break
            iterations += 1

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
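
# --- Illustration (not part of the scraped example above) ---
# A sketch of the MPI pattern used in this main(): an optional barrier followed
# by root-only logging, written directly against mpi4py. It assumes mpi_helper
# wraps MPI.COMM_WORLD in roughly this way; run with e.g. "mpirun -n 4 python".

from mpi4py import MPI

comm = MPI.COMM_WORLD
mpi_root = 0

synchronize = True
if synchronize:
    comm.Barrier()  # wait until every rank reaches this point

if comm.Get_rank() == mpi_root:
    # Equivalent of log_helper.info(..., root=self.mpi_root, comm=self.mpi_comm):
    # only the root rank emits the message, so it appears once, not once per rank.
    print("Running the analysis workflow on " + str(comm.Get_size()) + " rank(s)")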
Example #5
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__,
                         'Reading inputs',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__,
                             'Preparing file lookup table',
                             comm=self.mpi_comm,
                             root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                        tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                    path=self['trees'])

        # Define the common pactolus parameters
        pactolus_parameters = {
            'file_lookup_table': file_lookup_table,
            'ms1_mass_tol': ms1_mass_tol,
            'ms2_mass_tol': ms2_mass_tol,
            'neutralizations': neutralizations,
            'max_depth': max_depth
        }

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__,
                                 'Preparing parallel execution',
                                 comm=self.mpi_comm,
                                 root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0, ), dtype='f4')
                id_data = np.zeros((0, ), dtype='a100')
                name = np.zeros((0, ), dtype='a100')
                mass = np.zeros((0, ), dtype='f4')
                n_peaks = np.zeros((0, ), dtype='i4')
                n_match = np.zeros((0, ), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    temp_data = [ri[1] for ri in result[0]]
                    # Compile scores
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__,
                         'Processing spectra',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                       if spectrum_index < (num_spectra - 1)
                       else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__,
                                time_str,
                                comm=self.mpi_comm,
                                root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2),
                                          dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                        ms1_mz=[current_parent_mass, ],
                                                                        params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print(time_str)
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(
                        score_frag_dag.make_pactolus_hit_table(
                            pactolus_results=hit_matrix[current_index],
                            table_file=file_lookup_table,
                            original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['n_peaks'][score_index] if False else current_hit_table['mass'][score_index])
                        n_peaks.append(current_hit_table['n_peaks'][score_index])
                        n_match.append(current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(
            mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
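
# --- Illustration (not part of the scraped example above) ---
# A small numpy demo of the fpl_peak_arrayindex layout the serial loop above
# relies on: each row is [x, y, array_offset], and spectrum i occupies the flat
# peak_mz/peak_value arrays from its offset up to the next spectrum's offset
# (or the end of the array for the last spectrum). The toy data is made up.

import numpy as np

peak_mz = np.array([100.0, 150.0, 200.0, 110.0, 210.0])  # flat m/z values
peak_value = np.array([1.0, 2.0, 3.0, 4.0, 5.0])         # flat intensities
peak_arrayindex = np.array([[0, 0, 0],                    # pixel (0,0) starts at offset 0
                            [0, 1, 3]])                   # pixel (0,1) starts at offset 3

num_spectra = peak_arrayindex.shape[0]
for i in range(num_spectra):
    start = int(peak_arrayindex[i, 2])
    stop = int(peak_arrayindex[i + 1, 2]) if i < (num_spectra - 1) else peak_value.size
    print("pixel " + str(peak_arrayindex[i, 0:2]) +
          " -> mz=" + str(peak_mz[start:stop]) +
          " intensities=" + str(peak_value[start:stop]))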
Example #6
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=self['trees'])

        # Define the common pactolus parameters
        pactolus_parameters = {'file_lookup_table': file_lookup_table,
                               'ms1_mass_tol': ms1_mass_tol,
                               'ms2_mass_tol': ms2_mass_tol,
                               'neutralizations': neutralizations,
                               'max_depth': max_depth}

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0,), dtype='f4')
                id_data = np.zeros((0,), dtype='a100')
                name = np.zeros((0,), dtype='a100')
                mass = np.zeros((0,), dtype='f4')
                n_peaks = np.zeros((0,), dtype='i4')
                n_match = np.zeros((0,), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    temp_data = [ri[1] for ri in result[0]]
                    # Compile scores
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index+1), 2]
                   if spectrum_index < (num_spectra-1)
                   else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                        ms1_mz=[current_parent_mass, ],
                                                                        params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print(time_str)
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                        pactolus_results=hit_matrix[current_index],
                        table_file=file_lookup_table,
                        original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(current_hit_table['n_peaks'][score_index])
                        n_match.append(current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
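
# --- Illustration (not part of the scraped example above) ---
# A sketch of what a STATIC schedule in mpi_helper.parallel_over_axes plausibly
# does with main_data and split_axes=[0, ]: partition the spectrum indices along
# axis 0 and hand each rank one contiguous block. This is an assumption about
# the helper's behavior, shown with plain numpy so no MPI is needed to run it.

import numpy as np

spectrum_indexes = np.arange(10)  # the main_data passed to the scheduler
num_ranks = 4                     # size of the MPI communicator

blocks = np.array_split(spectrum_indexes, num_ranks)  # split along axis 0
for rank, block in enumerate(blocks):
    # Each rank would call execute_analysis(spectrum_indexes=block, ...) and
    # thereby run the serial code path on its own subblock.
    print("rank " + str(rank) + " processes spectra " + str(block))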
Example #7
    def __init__(self,
                 hdr_filename=None,
                 t2m_filename=None,
                 img_filename=None,
                 basename=None,
                 requires_slicing=True):
        """Open an img file for data reading.

            :param hdr_filename: The name of the hdr header file
            :type hdr_filename: string

            :param t2m_filename: The name of the t2m file
            :type t2m_filename: string

            :param img_filename: The name of the img data file
            :type img_filename: string

            :param basename: Instead of img_filename, t2m_filename, and hdr_filename one may also supply just
                             a single basename. The basename is completed with the .img, .t2m, .hdr extension
                             to load the data.
            :type basename: string

            :param requires_slicing: Unused here. Slicing is always supported by this reader.
            :type requires_slicing: Boolean

            :raises ValueError: In case that both basename and the hdr_filename, t2m_filename, and
                                img_filename parameters are specified.
        """
        super(img_file, self).__init__(basename, requires_slicing)
        self.data_type = 'uint16'
        self.shape = [0, 0, 0]  # Number of pixels in x,y, and z. NOTE: Type changed to tuple later on.
        self.mz = 0  # A numpy vector with the m/z values of the instrument

        if basename and hdr_filename and t2m_filename and img_filename:
            raise ValueError(
                "Conflicting input. Provide either basename or the " +
                "hdr_filename,t2m_filename,img_filename parameters but not both."
            )
        if basename:
            basefile = basename
            if os.path.isdir(basename):
                filelist = self.get_files_from_dir(basename)
                log_helper.log_var(__name__, filelist=filelist)
                if len(filelist) > 0:
                    basefile = filelist[0]
                else:
                    raise ValueError(
                        "No valid img file found in the given directory.")
            elif basefile.endswith(".img") and os.path.exists(basefile):
                basefile = basefile.rstrip(".img")
            elif basefile.endswith(".hdr") and os.path.exists(basefile):
                basefile = basefile.rstrip(".hdr")
            elif basefile.endswith(".t2m") and os.path.exists(basefile):
                basefile = basefile.rstrip(".t2m")

            log_helper.log_var(__name__, basefile=basefile)
            if os.path.exists(basefile + ".hdr") and \
                    os.path.exists(basefile + ".t2m") and \
                    os.path.exists(basefile + ".img"):
                hdr_filename = basefile + ".hdr"
                t2m_filename = basefile + ".t2m"
                img_filename = basefile + ".img"
            else:
                raise ValueError(
                    "No valid img file found for the given basename.")
        elif hdr_filename and t2m_filename and img_filename:
            pass  # Nothing to be done
        else:
            raise ValueError(
                "Missing input parameter. Either provide: " +
                " i) basename or ii) hdr_filename, t2m_filename, img_filename")

        # Initialize the x and y length
        hdr = open(hdr_filename, 'rb')
        hdrdata = np.fromfile(file=hdr_filename, dtype='int16', count=-1)
        self.shape[0] = int(hdrdata[23])
        self.shape[1] = int(hdrdata[22])
        hdr.close()

        # Initialize the z length
        t2m = open(t2m_filename, 'rb')
        self.mz = np.fromfile(file=t2m, dtype='float32', count=-1)
        self.shape[2] = self.mz.shape[0]
        t2m.close()

        # Convert the shape variable to the expected tuple
        self.shape = tuple(self.shape)

        # Open the img file with the spectrum data
        self.img_filename = img_filename
        self.file_opened = False
        try:
            self.m_img_file = np.memmap(filename=self.img_filename,
                                        dtype=self.data_type,
                                        shape=self.shape,
                                        mode='r',
                                        order='C')
            self.file_opened = True
        except ValueError:
            # Check if the size of the file matches what we expect
            imgsize = os.stat(self.img_filename).st_size
            itemsize = np.dtype(self.data_type).itemsize
            expectednumvalues = int(self.shape[0]) * int(self.shape[1]) * int(
                self.shape[2])
            expectedsize = expectednumvalues * int(itemsize)
            sizedifference = expectedsize - imgsize
            log_helper.warning(__name__, "IMG size: " + str(imgsize) + " Expected size: " +
                               str(expectedsize) + "  (difference=" + str(sizedifference) + ")")
            if imgsize < expectedsize:
                # Check whether the missing data aligns with images or spectra
                slicesize = int(self.shape[0]) * int(self.shape[1]) * itemsize
                spectrumsize = int(self.shape[2]) * itemsize
                percentmissing = float(sizedifference) / float(expectedsize)
                valuesmissing = float(sizedifference) / itemsize
                warnings.warn("WARNING: Missing " + str(sizedifference) +
                              " bytes in img file (missing " +
                              str(valuesmissing) + " intensity values; " +
                              str(percentmissing) + "%)." +
                              " Expected shape: " + str(self.shape))
                # Define how we should deal with the error
                expandslice = (sizedifference % slicesize) == 0
                expandspectra = (sizedifference % spectrumsize) == 0
                if not expandslice:
                    expandspectra = True
                # Complete missing spectra
                if expandspectra:
                    warnings.warn(
                        "Dealing with missing data in img file by completing last spectra with 0's."
                    )
                    # TODO np.require creates an in-memory copy of the full data. Allow usage of memmap'ed tempfile.
                    tempmap = np.require(np.memmap(filename=self.img_filename,
                                                   dtype=self.data_type,
                                                   mode='r',
                                                   order='C'),
                                         requirements=['O', 'C'])
                    # Extend the memmap to the expected size
                    tempmap.resize((expectednumvalues, ))
                    # Reshape the memmap to the expected shape
                    self.m_img_file = tempmap.reshape(self.shape, order='C')
                    self.file_opened = True
                # Complete missing slices
                elif expandslice:
                    slicesmissing = sizedifference // slicesize
                    self.mz = self.mz[:(-slicesmissing)]
                    warnings.warn("Dealing with missing data in img file by updating the m/z axis." +
                                  " It looks like the m/z axis data may be inconsistent" +
                                  " with the binary data. Removing " + str(slicesmissing) +
                                  " bins from the m/z axis.")
                    self.shape = list(self.shape)
                    self.shape[2] = self.mz.shape[0]
                    self.shape = tuple(self.shape)
                    self.m_img_file = np.memmap(filename=self.img_filename,
                                                dtype=self.data_type,
                                                shape=self.shape,
                                                mode='r',
                                                order='C')
                    self.file_opened = True
                else:
                    raise
            else:
                raise
        except:
            log_helper.error(
                __name__, "Error while opening the img file: " + img_filename)
            raise
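
# --- Illustration (not part of the scraped example above) ---
# The recovery logic in the ValueError handler above is simple arithmetic.
# A worked example with made-up numbers: a 4 x 3 image with 1000 m/z bins
# stored as uint16 should occupy 4*3*1000*2 bytes; if the file is short by a
# whole number of slices (x*y values) or spectra (z values), the reader can
# still repair the mapping.

import numpy as np

nx, ny, nz = 4, 3, 1000
itemsize = np.dtype('uint16').itemsize         # 2 bytes per intensity value
expectedsize = nx * ny * nz * itemsize         # 24000 bytes expected on disk
imgsize = expectedsize - (nx * ny * itemsize)  # pretend exactly one slice is missing

sizedifference = expectedsize - imgsize
slicesize = nx * ny * itemsize
spectrumsize = nz * itemsize
print("missing " + str(sizedifference) + " bytes")
print("aligns with whole slices:  " + str((sizedifference % slicesize) == 0))
print("aligns with whole spectra: " + str((sizedifference % spectrumsize) == 0))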
Example #8
    def main(self):
        """
        Execute the analysis workflow
        """
        # Do the optional MPI barrier
        if self['synchronize']:
            mpi_helper.barrier(comm=self.mpi_comm)

        # Check if we have anything to do at all
        if len(self.get_analyses()) == 0:
            log_helper.info(__name__, "The workflow is empty", root=self.mpi_root, comm=self.mpi_comm)
            return

        # Add all dependencies to the workflow
        log_helper.debug(__name__, "Executing the workflow", root=self.mpi_root, comm=self.mpi_comm)
        log_helper.debug(__name__, "Adding all dependencies", root=self.mpi_root, comm=self.mpi_comm)
        self.add_analysis_dependencies()

        # Execute the workflow in a greedy fashion (i.e., execute whichever analysis is ready and has not been run yet)
        log_helper.debug(__name__, "Running the analysis workflow", root=self.mpi_root, comm=self.mpi_comm)
        all_analyses = self.get_analyses()
        iterations = 0
        continue_running = True
        while continue_running:
            # Run all analyses that are ready
            for analysis in all_analyses:
                if analysis.update_analysis and len(analysis.check_ready_to_execute()) == 0:
                    log_helper.debug(__name__, "Execute analysis: " + str(analysis),
                                     root=self.mpi_root, comm=self.mpi_comm)
                    analysis.execute()
                    if self['reduce_memory_usage']:
                        analysis.clear_and_restore()
            # Check if there are any other tasks that we need to execute now
            num_tasks_completed, num_tasks_waiting, num_tasks_ready, num_tasks_blocked = \
                all_analyses.task_status_stats()
            if num_tasks_waiting == 0:
                log_helper.info(__name__, "Completed executing the workflow.", root=self.mpi_root, comm=self.mpi_comm)
                continue_running = False
            if num_tasks_waiting > 0 and num_tasks_ready == 0:
                blocking_tasks = all_analyses.get_blocking_tasks()
                log_helper.warning(__name__, "Workflow could not be fully executed. " + str(num_tasks_waiting) +
                                   " remain in the queue but cannot be completed due to unresolved dependencies." +
                                   " The workflow will be restarted once the outputs of the blocking tasks are ready." +
                                   " Blocking tasks are: " + str(blocking_tasks),
                                   root=self.mpi_root, comm=self.mpi_comm)
                # Tell all blocking tasks that they should continue the workflow once they are ready
                # This happens in omsi.analysis.analysis_base.outputs_ready(...) function
                for block_task in blocking_tasks:
                    block_task.continue_workflow_when_ready(self)
                #  NOTE: if self['reduce_memory_usage'] is True then prior analyses were cleared, i.e.,
                #        they will be re-executed when the workflow is restarted. It is, therefore, not recommended
                #        to use the reduce_memory_usage option when performing interactive tasks.

                continue_running = False
            iterations += 1
        # All analyses are done, so we no longer need to continue any analyses when they become ready
        if num_tasks_blocked == 0:
            for analysis in all_analyses:
                analysis.continue_analysis_when_ready = False

        log_helper.log_var(__name__, iterations=iterations, level='DEBUG', root=self.mpi_root, comm=self.mpi_comm)
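
# --- Illustration (not part of the scraped example above) ---
# A sketch of the four counters that task_status_stats() presumably derives from
# the task list: completed (no update needed), waiting (update needed), ready
# (waiting with all dependencies met), and blocked (waiting but not ready). The
# stub class below is hypothetical; only the counting logic mirrors the example.

class AnalysisStub(object):
    def __init__(self, update_analysis, unmet_dependencies):
        self.update_analysis = update_analysis
        self.unmet_dependencies = unmet_dependencies

    def check_ready_to_execute(self):
        return self.unmet_dependencies  # an empty list means ready to execute

def task_status_stats(tasks):
    completed = sum(1 for t in tasks if not t.update_analysis)
    waiting = sum(1 for t in tasks if t.update_analysis)
    ready = sum(1 for t in tasks
                if t.update_analysis and len(t.check_ready_to_execute()) == 0)
    blocked = waiting - ready
    return completed, waiting, ready, blocked

tasks = [AnalysisStub(False, []), AnalysisStub(True, []), AnalysisStub(True, ['missing_input'])]
print(task_status_stats(tasks))  # (1, 2, 1, 1)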
Example #9
    def __init__(self, hdr_filename=None, t2m_filename=None, img_filename=None, basename=None, requires_slicing=True):
        """Open an img file for data reading.

            :param hdr_filename: The name of the hdr header file
            :type hdr_filename: string

            :param t2m_filename: The name of the t2m file
            :type t2m_filename: string

            :param img_filename: The name of the img data file
            :type img_filename: string

            :param basename: Instead of img_filename, t2m_filename, and hdr_filename one may also supply just
                             a single basename. The basename is completed with the .img, .t2m, .hdr extension
                             to load the data.
            :type basename: string

            :param requires_slicing: Unused here. Slicing is always supported by this reader.
            :type requires_slicing: Boolean

            :raises ValueError: In case that both basename and the hdr_filename, t2m_filename, and
                                img_filename parameters are specified.
        """
        super(img_file, self).__init__(basename, requires_slicing)
        self.data_type = 'uint16'
        self.shape = [0, 0, 0]  # Number of pixels in x,y, and z. NOTE: Type changed to tuple later on.
        self.mz = 0  # A numpy vector with the m/z values of the instrument

        if basename and hdr_filename and t2m_filename and img_filename:
            raise ValueError(
                "Conflicting input. Provide either basename or the " +
                "hdr_filename,t2m_filename,img_filename parameters but not both.")
        if basename:
            basefile = basename
            if os.path.isdir(basename):
                filelist = self.get_files_from_dir(basename)
                log_helper.log_var(__name__, filelist=filelist)
                if len(filelist) > 0:
                    basefile = filelist[0]
                else:
                    raise ValueError("No valid img file found in the given directory.")
            elif basefile.endswith(".img") and os.path.exists(basefile):
                basefile = basefile.rstrip(".img")
            elif basefile.endswith(".hdr") and os.path.exists(basefile):
                basefile = basefile.rstrip(".hdr")
            elif basefile.endswith(".t2m") and os.path.exists(basefile):
                basefile = basefile.rstrip(".t2m")

            log_helper.log_var(__name__, basefile=basefile)
            if os.path.exists(basefile + ".hdr") and \
                    os.path.exists(basefile + ".t2m") and \
                    os.path.exists(basefile + ".img"):
                hdr_filename = basefile + ".hdr"
                t2m_filename = basefile + ".t2m"
                img_filename = basefile + ".img"
            else:
                raise ValueError("No valid img file found for the given basename.")
        elif hdr_filename and t2m_filename and img_filename:
            pass  # Nothing to be done
        else:
            raise ValueError("Missing input parameter. Either provide: " +
                             " i) basename or ii) hdr_filename, t2m_filename, img_filename")

        # Initialize the x and y length
        hdr = open(hdr_filename, 'rb')
        hdrdata = np.fromfile(file=hdr_filename, dtype='int16', count=-1)
        self.shape[0] = int(hdrdata[23])
        self.shape[1] = int(hdrdata[22])
        hdr.close()

        # Initialize the z length
        t2m = open(t2m_filename, 'rb')
        self.mz = np.fromfile(file=t2m, dtype='float32', count=-1)
        self.shape[2] = self.mz.shape[0]
        t2m.close()

        # Convert the shape variable to the expected tuple
        self.shape = tuple(self.shape)

        # Open the img file with the spectrum data
        self.img_filename = img_filename
        self.file_opened = False
        try:
            self.m_img_file = np.memmap(filename=self.img_filename,
                                        dtype=self.data_type,
                                        shape=self.shape,
                                        mode='r',
                                        order='C')
            self.file_opened = True
        except ValueError:
            # Check if the size of the file matches what we expect
            imgsize = os.stat(self.img_filename).st_size
            itemsize = np.dtype(self.data_type).itemsize
            expectednumvalues = int(self.shape[0]) * int(self.shape[1]) * int(self.shape[2])
            expectedsize = expectednumvalues * int(itemsize)
            sizedifference = expectedsize - imgsize
            log_helper.warning(__name__, "IMG size: " + str(imgsize) + " Expected size: " +
                               str(expectedsize) + "  (difference=" + str(sizedifference) + ")")
            if imgsize < expectedsize:
                # Check whether the missing data aligns with images or spectra
                slicesize = int(self.shape[0]) * int(self.shape[1]) * itemsize
                spectrumsize = int(self.shape[2]) * itemsize
                percentmissing = float(sizedifference)/float(expectedsize)
                valuesmissing = float(sizedifference) / itemsize
                warnings.warn("WARNING: Missing "+str(sizedifference) +
                              " bytes in img file (missing " + str(valuesmissing) +
                              " intensity values; "+str(percentmissing)+"%)." +
                              " Expected shape: "+str(self.shape))
                # Define how we should deal with the error
                expandslice = (sizedifference % slicesize) == 0
                expandspectra = (sizedifference % spectrumsize) == 0
                if not expandslice:
                    expandspectra = True
                # Complete missing spectra
                if expandspectra:
                    warnings.warn("Dealing with missing data in img file by completing last spectra with 0's.")
                    # TODO np.require creates an in-memory copy of the full data. Allow usage of memmap'ed tempfile.
                    tempmap = np.require(np.memmap(filename=self.img_filename,
                                                   dtype=self.data_type,
                                                   mode='r',
                                                   order='C'),
                                         requirements=['O', 'C'])
                    # Extend the memmap to the expected size
                    tempmap.resize((expectednumvalues, ))
                    # Reshape the memmap to the expected shape
                    self.m_img_file = tempmap.reshape(self.shape, order='C')
                    self.file_opened = True
                # Complete missing slices
                elif expandslice:
                    slicesmissing = sizedifference // slicesize
                    self.mz = self.mz[:(-slicesmissing)]
                    warnings.warn("Dealing with missing data in img file by updating he m/z axis.." +
                                  " It looks like the m/z axis data may be inconsistent" +
                                  " with the binary data. Removing "+str(slicesmissing) +
                                  " bins from the m/z axis.")
                    self.shape = list(self.shape)
                    self.shape[2] = self.mz.shape[0]
                    self.shape = tuple(self.shape)
                    self.m_img_file = np.memmap(filename=self.img_filename,
                                                dtype=self.data_type,
                                                shape=self.shape,
                                                mode='r',
                                                order='C')
                    self.file_opened = True
                else:
                    raise
            else:
                raise
        except:
            log_helper.error(__name__, "Error while opening the img file: " + img_filename)
            raise
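
# --- Illustration (not part of the scraped example above) ---
# A minimal, self-contained demo of the np.memmap call used to open the .img
# cube: write a small fake uint16 file, then map it read-only with the shape
# that the reader derives from the .hdr (x, y) and .t2m (m/z) files. The path
# and sizes are made up for the demo.

import os
import tempfile
import numpy as np

shape = (4, 3, 5)  # (x pixels, y pixels, m/z bins)
data = np.arange(np.prod(shape), dtype='uint16').reshape(shape)

img_filename = os.path.join(tempfile.mkdtemp(), 'fake.img')
data.tofile(img_filename)  # raw C-order binary, like a real .img file

m_img_file = np.memmap(filename=img_filename, dtype='uint16',
                       shape=shape, mode='r', order='C')
print(m_img_file[2, 1, :])               # the full spectrum of pixel (2, 1)
print(np.array_equal(m_img_file, data))  # True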