Example #1
    def write_analysis_data(self, analysis_group=None):
        """
        This function is used to write the actual analysis data to file. If not implemented, then the
        omsi_file_analysis API's default behavior is used instead.

        :param analysis_group: The h5py.Group object where the analysis is stored.

        """
        # Check if a user attempts to do parallel I/O with collect being disabled
        if mpi_helper.get_size() > 1 and not self['collect']:
            # Check if any of the other ranks have data
            num_elements = self['peak_arrayindex'].shape[0] if len(
                self['peak_arrayindex'].shape) == 2 else 0
            result_sizes = mpi_helper.gather(num_elements,
                                             comm=self.mpi_comm,
                                             root=self.mpi_root)
            if mpi_helper.get_rank() == self.mpi_root:
                for element_size in result_sizes[1:]:
                    if element_size > 0:
                        raise ValueError(
                            'Parallel I/O with collect parameter set to false not supported'
                        )
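        # Raise NotImplementedError so that the default data write of the omsi_file_analysis API is used instead (see docstring)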
        raise NotImplementedError
        """
Example #2
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays  with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__,
                         'Reading inputs',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
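        # A precursor_mz of -1 indicates that the precursor m/z should be read from the fpl_data instead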
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__,
                             'Preparing file lookup table',
                             comm=self.mpi_comm,
                             root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                        tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(
                    path=self['trees'])

        # Define the common Pactolus parameters
        pactolus_parameters = {
            'file_lookup_table': file_lookup_table,
            'ms1_mass_tol': ms1_mass_tol,
            'ms2_mass_tol': ms2_mass_tol,
            'neutralizations': neutralizations,
            'max_depth': max_depth
        }

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
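        # If no spectrum subset was given, process all spectra and allow parallel execution to be initiated;
        # otherwise process only the given subset in serial (see docstring)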
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__,
                                 'Preparing parallel execution',
                                 comm=self.mpi_comm,
                                 root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0, ), dtype='f4')
                id_data = np.zeros((0, ), dtype='a100')
                name = np.zeros((0, ), dtype='a100')
                mass = np.zeros((0, ), dtype='f4')
                n_peaks = np.zeros((0, ), dtype='i4')
                n_match = np.zeros((0, ), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    temp_data = [ri[1] for ri in result[0]]
                    # Compile scores
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__,
                         'Processing spectra',
                         comm=self.mpi_comm,
                         root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index + 1), 2]
                       if spectrum_index < (num_spectra - 1)
                       else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__,
                                time_str,
                                comm=self.mpi_comm,
                                root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2),
                                          dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                        ms1_mz=[current_parent_mass, ],
                                                                        params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print time_str
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(
                        score_frag_dag.make_pactolus_hit_table(
                            pactolus_results=hit_matrix[current_index],
                            table_file=file_lookup_table,
                            original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(
                            current_hit_table['n_peaks'][score_index])
                        n_match.append(
                            current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(
            mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
Example #3
    def execute_analysis(self, spectrum_indexes=None, file_lookup_table=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param file_lookup_table: The Pactolus lookup table with the list of tree files and their mass.

        :returns: A series of numpy arrays  with the score data for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image.

            ['pixel_index', 'score', 'id', 'name', 'mass', 'n_peaks', 'n_match']
                * 'pixel_index'  , int,  2D array of pixel indices describing for each spectrum \
                   the (x,y) pixel location in the image
                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        log_helper.debug(__name__, 'Reading inputs', comm=self.mpi_comm, root=self.mpi_root)
        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']
        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']
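        # A precursor_mz of -1 indicates that the precursor m/z should be read from the fpl_data instead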
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        ms1_mass_tol = self['ms1_mass_tolerance']
        ms2_mass_tol = self['ms2_mass_tolerance']
        neutralizations = self['neutralizations']
        max_depth = self['max_depth']

        # Make the numpy array with the list of tree files and their MS1 masses
        if file_lookup_table is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            log_helper.debug(__name__, 'Preparing file lookup table', comm=self.mpi_comm, root=self.mpi_root)
            if os.path.isfile(self['trees']):
                if self['trees'].endswith('.npy'):
                    file_lookup_table = np.load(self['trees'])
                else:
                    in_treefile = open(self['trees'], 'r')
                    tree_files = [line.rstrip('\n') for line in in_treefile]
                    in_treefile.close()
                    file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(tree_files=tree_files)
            elif os.path.isdir(self['trees']):
                file_lookup_table = score_frag_dag.make_file_lookup_table_by_MS1_mass(path=self['trees'])

        # Define the common Pactolus parameters
        pactolus_parameters = {'file_lookup_table': file_lookup_table,
                               'ms1_mass_tol': ms1_mass_tol,
                               'ms2_mass_tol': ms2_mass_tol,
                               'neutralizations': neutralizations,
                               'max_depth': max_depth}

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
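        # If no spectrum subset was given, process all spectra and allow parallel execution to be initiated;
        # otherwise process only the given subset in serial (see docstring)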
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                log_helper.debug(__name__, 'Preparing parallel execution', comm=self.mpi_comm, root=self.mpi_root)
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'file_lookup_table': file_lookup_table},  # Reuse the file_lookup_table
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                pixel_index = np.zeros((0, 2), dtype='int')
                score = np.zeros((0,), dtype='f4')
                id_data = np.zeros((0,), dtype='a100')
                name = np.zeros((0,), dtype='a100')
                mass = np.zeros((0,), dtype='f4')
                n_peaks = np.zeros((0,), dtype='i4')
                n_match = np.zeros((0,), dtype='i4')

                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                # elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    log_helper.debug(__name__, 'Compiling output')
                    # Compile pixel_index
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                    temp_data = [ri[1] for ri in result[0]]
                    # Compile scores
                    if len(temp_data) > 0:
                        score = np.concatenate(tuple(temp_data), axis=0)
                    # Compile id
                    temp_data = [ri[2] for ri in result[0]]
                    if len(temp_data) > 0:
                        id_data = np.concatenate(tuple(temp_data), axis=0)
                    # Compile name
                    temp_data = [ri[3] for ri in result[0]]
                    if len(temp_data) > 0:
                        name = np.concatenate(tuple(temp_data), axis=0)
                    # Compile mass
                    temp_data = [ri[4] for ri in result[0]]
                    if len(temp_data) > 0:
                        mass = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_peaks
                    temp_data = [ri[5] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_peaks = np.concatenate(tuple(temp_data), axis=0)
                    # Compile n_match
                    temp_data = [ri[6] for ri in result[0]]
                    if len(temp_data) > 0:
                        n_match = np.concatenate(tuple(temp_data), axis=0)
                    log_helper.log_var(__name__, score=score)
                # Return the compiled output
                return pixel_index, score, id_data, name, mass, n_peaks, n_match

        #############################################################
        # Serial processing of the current data block
        #############################################################
        log_helper.debug(__name__, 'Processing spectra', comm=self.mpi_comm, root=self.mpi_root)
        # Initialize the output data structures
        # pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        # if len(pixel_index.shape) == 1:
        #    pixel_index = pixel_index[np.newaxis, :]
        hit_matrix = []

        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = int(fpl_peak_arrayindex[spectrum_index, 2])
            stop = int(fpl_peak_arrayindex[(spectrum_index+1), 2]
                   if spectrum_index < (num_spectra-1)
                   else fpl_peak_value.size)
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                           str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 2), dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = precursor_mz if len(precursor_mz) == 1 else precursor_mz[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = score_frag_dag.score_scan_list_against_trees(scan_list=[current_peaks_list, ],
                                                                        ms1_mz=[current_parent_mass, ],
                                                                        params=pactolus_parameters)
            end_time = time.time()
            execution_time = end_time - start_time
            time_str = "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + \
                       str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str((current_hits > 0).sum())
            #log_helper.info(__name__, time_str, comm=self.mpi_comm, root=None)
            #sys.stdout.flush()
            print time_str
            sys.stdout.flush()

            # Save the hits for the current pixel
            hit_matrix.append(current_hits[0, :])

        # Index the results based on the given metabolite database
        score = []
        id_data = []
        name = []
        mass = []
        n_peaks = []
        n_match = []
        pixel_index = []
        if len(metabolite_database) > 0:  # We don't have an empty string
            for current_index, spectrum_index in enumerate(spectrum_indexes):
                non_zero_scores = np.where(hit_matrix[current_index] > 0)[0]
                if non_zero_scores.size > 0:
                    current_hit_table = np.asarray(score_frag_dag.make_pactolus_hit_table(
                        pactolus_results=hit_matrix[current_index],
                        table_file=file_lookup_table,
                        original_db=metabolite_database))
                    for score_index in non_zero_scores:
                        pixel_index.append(fpl_peak_arrayindex[spectrum_index, 0:2])
                        score.append(current_hit_table['score'][score_index])
                        id_data.append(current_hit_table['id'][score_index])
                        name.append(current_hit_table['name'][score_index])
                        mass.append(current_hit_table['mass'][score_index])
                        n_peaks.append(current_hit_table['n_peaks'][score_index])
                        n_match.append(current_hit_table['n_match'][score_index])
        else:
            pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
            score = np.asarray(hit_matrix)

        # Return the hit_table and the index of the pixel each hit_table applies to
        print "rank : " + str(mpi_helper.get_rank()) + " : scores " + str(score)
        sys.stdout.flush()
        return np.asarray(pixel_index), \
               np.asarray(score), \
               np.asarray(id_data), \
               np.asarray(name), \
               np.asarray(mass), \
               np.asarray(n_peaks), \
               np.asarray(n_match)
Example #4
    def __init__(self, file_params, max_depth, isotope_dict=None):
        """
        :param file_params: Dictionary with the file settings, expected to contain the keys
            'input_inchi_file', 'output_hdf5_file_base', 'output_error_log', and 'output_directory'.
        :param max_depth: Maximum depth to which trees are grown for each InChI.
        :param isotope_dict: Optional isotope dictionary. If None, then get_isotope_dict() is used to create it.
        """
        self.input_inchi_file = file_params['input_inchi_file']
        self.output_hdf5_file_base = file_params['output_hdf5_file_base']
        self.output_error_log = file_params['output_error_log']
        self.output_directory = file_params['output_directory']

        # Make output directory if it does not exist
        if not os.path.isdir(self.output_directory):
            try:
                os.mkdir(self.output_directory)
            except OSError:
                # When executed in parallel it is possible that another rank already created the dir
                # in the meantime. We can safely ignore this error.
                if os.path.isdir(self.output_directory):
                    pass
                else:
                    raise

        # Get isotope dictionary (if none was provided)
        if isotope_dict is None:
            self.isotope_dict = get_isotope_dict()
        else:
            self.isotope_dict = isotope_dict

        self.max_depth = max_depth

        # make list of inchis
        inchi_list = []
        with open(self.input_inchi_file, 'r') as inchi_file:
            for line in inchi_file:
                inchi_list.append(line.strip())
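            # Fail early if the input file did not contain any InChI strings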
            assert inchi_list
        self.inchi_list = inchi_list

        # ensure any pre-existing output logs are overwritten
        with open(self.output_error_log, 'w') as _:
            pass

        # execute in parallel if possible
        if mpi_helper.MPI_AVAILABLE and mpi_helper.get_size() > 1:
            scheduler = mpi_helper.parallel_over_axes(
                task_function=self.grow_tree_from_inchi,
                task_function_params={},
                main_data=np.unique(np.asarray(inchi_list)),  # FIXME Why are there duplicates in the inchi list
                split_axes=[0, ],
                main_data_param_name='inchi',
                root=0,
                schedule=mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'],
                comm=mpi_helper.get_comm_world())
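            # Distribute the unique InChIs across the MPI ranks; the scheduler's return value is not needed here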
            _ = scheduler.run()

        else:
            for inchi in inchi_list:
                self.grow_tree_from_inchi(inchi)

        return
Example #5
    def execute_analysis(self, msidata_subblock=None):
        """
        Execute the local peak finder for the given msidata.

        :param msidata_subblock: Optional input parameter used for parallel execution of the
            analysis only. If msidata_subblock is set, then the given subblock will be processed
            in SERIAL instead of processing self['msidata'] in PARALLEL (if available). This
            parameter is strictly optional and intended for internal use only.

        """
        # Make sure needed imports are available
        from omsi.analysis.findpeaks.third_party.findpeaks import findpeaks
        import numpy as np

        # Assign parameters to local variables for convenience
        msidata = self['msidata']
        if msidata_subblock is not None:
            msidata = msidata_subblock
        mzdata = self['mzdata']
        integration_width = self['integration_width']
        peakheight = self['peakheight']
        slwindow = self['slwindow']
        smoothwidth = self['smoothwidth']
        print_status = self['printStatus']
        if print_status:
            import sys

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(self['msidata'].shape) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if msidata_subblock is None:
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = range(len(self['msidata'].shape) - 1)  # The axes along which we can split the data
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,        # Execute this function
                    task_function_params={},                    # No added parameters
                    main_data=msidata,                          # Process the msidata
                    split_axes=split_axis,                      # Split along axes
                    main_data_param_name='msidata_subblock',    # data input param
                    root=self.mpi_root,                         # The root MPI task
                    schedule=self['schedule'],                  # Parallel schedule
                    comm=self.mpi_comm)                         # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # TODO Record runtime information data from the scheduler in our provenance data
                # self.run_info['SCHEDULER_blocks'] = scheduler.blocks
                # self.run_info['SCHEDULER_block_times'] = scheduler.block_times
                # self.run_info['SCHEDULER_run_time'] = scheduler.run_time
                # self.run_info['SCHEDULER_schedule'] = scheduler.schedule

                # Compile the data from the parallel execution
                # Case table (which branch below handles which configuration):
                #
                #   collect + worker  ->  Case 2
                #             worker  ->  Case 2
                #   collect + root    ->  Case 3
                #             root    ->  Case 1
                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])
                # Case 1: root rank with collect disabled
                if mpi_helper.get_rank() == self.mpi_root and not self['collect']:
                    # We did not process any data on the root if DYNAMIC scheduling was used
                    if use_dynamic_schedule:
                        return None, None, None, mzdata
                    # We processed a data block using static scheduling
                    else:
                        return result[0][0]
                # Case 2: Compile the data on the worker
                elif mpi_helper.get_rank() != self.mpi_root:  # and use_dynamic_schedule:
                    # Compile the results from all processing tasks (on workers) or from all workers (on the root)
                    peak_mz = np.concatenate(tuple([ri[0] for ri in result[0]]), axis=-1)
                    peak_values = np.concatenate(tuple([ri[1] for ri in result[0]]), axis=-1)
                    if len(result[1]) > 1:  # Correct indices from the individual runs since they all start at 0
                        peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                        peak_arrayindex[:, 2] = np.cumsum([0] + [len(ri[0]) for ri in result[0]])[:-1]
                    else:
                        peak_arrayindex = result[0][0][2]
                    mzdata = result[0][0][3]
                    return peak_mz, peak_values, peak_arrayindex, mzdata
                # Case 3: Compile collected data on the root
                elif mpi_helper.get_rank() == self.mpi_root:  # and use_dynamic_schedule:
                    # Compile the results from all processing tasks (on workers) or from all workers (on the root)
                    peak_mz = np.concatenate(tuple([ri[0] for ri in result[0]]), axis=-1)
                    peak_values = np.concatenate(tuple([ri[1] for ri in result[0]]), axis=-1)
                    # Dynamic scheduling uses selections of (int,int,slice) while the static
                    # scheduling uses (slice, slice, slice), hence we need to compile the peak_arrayindex
                    # slightly differently depending on the scheduler used
                    if use_dynamic_schedule:
                        peak_arrayindex = np.asarray([[b[0], b[1], 0] for b in result[1]])
                        peak_arrayindex[:, 2] = np.cumsum([0] + [len(ri[0]) for ri in result[0]])[:-1]
                    else:
                        peak_arrayindex = np.concatenate(tuple([ri[2] for ri in result[0]]), axis=0)
                        d = np.cumsum([0] + [len(ri[0]) for ri in result[0]])
                        d2 = np.cumsum([0] + [len(ri[2]) for ri in result[0]])
                        for di in range(len(d2) - 1):
                            peak_arrayindex[d2[di]:d2[di + 1], 2] += d[di]
                    mzdata = result[0][0][3]
                    return peak_mz, peak_values, peak_arrayindex, mzdata

        #############################################################
        # Serial processing of the current data block
        #############################################################
        # Ensure our MSI dataset has a sufficient number of dimensions
        if len(msidata.shape) == 1:
            msidata = msidata[:][np.newaxis, np.newaxis, :]
        elif len(msidata.shape) == 2:
            msidata = msidata[:][np.newaxis, :]

        # Determine the data dimensions
        shape_x = msidata.shape[0]
        shape_y = msidata.shape[1]

        peak_mz = []  # The x values for all peaks, stored in a linear array
        peak_values = []  # The y values for all peaks, stored in a linear array
        # List describing for each pixel the start index where its peaks
        # are stored in the peaks_MZ and peaks_values array
        peak_arrayindex = np.zeros(shape=(shape_x * shape_y, 3), dtype='int64')
        current_index = long(0)
        pixel_index = 0
        for xi in xrange(0, shape_x):
            for yi in xrange(0, shape_y):
                if print_status:
                    sys.stdout.write("[" + str(
                        int(100. * float(pixel_index) /
                            float(shape_x * shape_y))) + "%]" + "\r")
                    sys.stdout.flush()

                # Load the spectrum
                y = msidata[xi, yi, :]
                # Find peaks in the spectrum
                peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow,
                                        peakheight)
                y = peak_finder.smoothListGaussian()
                # from the smoothed spectra subtract a sliding minima
                peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow,
                                        peakheight)
                slmin = [x for x in peak_finder.sliding_window_minimum()]
                y = y - slmin
                # find peaks in the smoothed, background subtracted spectra
                peak_finder = findpeaks(mzdata[:], y, smoothwidth, slwindow,
                                        peakheight)
                [pkmax, pkmin] = peak_finder.peakdet()
                xp = [x[0] for x in pkmax]
                yp = [x[1] for x in pkmax]
                peak_mz = peak_mz + xp
                peak_values = peak_values + yp
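                # Record the (x, y) location of this pixel and the offset at which its peaks start
                # in the flat peak_mz / peak_values lists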
                peak_arrayindex[pixel_index, 0] = xi
                peak_arrayindex[pixel_index, 1] = yi
                peak_arrayindex[pixel_index, 2] = current_index
                pixel_index += 1
                current_index += len(yp)

        # Add the analysis results and parameters to the analysis data so that they can be accessed and written to file.
        # We convert the single scalars to 1D numpy arrays here to ensure consistency. The data write function can
        # also handle a wide range of Python built-in types by converting them to numpy for storage in HDF5, but
        # to ensure consistent behavior we convert the values directly here.

        # Save the analysis data to the __data_list so that the data can be
        # saved automatically by the omsi HDF5 file API
        return np.asarray(peak_mz), np.asarray(peak_values), peak_arrayindex, mzdata[:]
Example #6
    def execute_analysis(self, spectrum_indexes=None, compound_list=None):
        """
        Execute the local peak finder for the given msidata.

        :param spectrum_indexes: List of integer indices of the subset of spectra
            that should be processed by this MPI task.  If spectrum_indexes is set, then the given
            subblock will be processed in SERIAL instead of processing self['fpl_data'] in PARALLEL
            (if available). This parameter is strictly optional and intended for internal use only
            to facilitate the efficient parallel implementation.

        :param compound_list: List of the compounds from the database file. This parameter is used
            to avoid having to read the compound database on every compute task that calls this function
            when running in parallel.  This  parameter is strictly optional and intended for internal
            use only to facilitate the efficient parallel implementation.

        :returns: A tuple with an array of hit_tables with the scores for each pixel and a 2D array
            of pixel indices describing for each spectrum the (x,y) pixel location in the image. The
            hit_table is an array of (#spectra x #compounds). The hit_table is a structured numpy
            array with the following columns:

                * 'score',  float,  MIDAS score of row
                * 'id',     str,    database ID e.g. 'MetaCyC_7884'
                * 'name',   str,    database name, e.g. 'glycine'
                * 'mass',   float,  mass in Da of IDed compound
                * 'n_peaks', int,   number of peaks in data
                * 'n_match', int,   number of peaks in data matched

        """
        # Assign parameter settings to local variables for convenience
        metabolite_database = self['metabolite_database']
        precursor_type = self['precursor_type']
        parent_mass_windows = self['parent_mass_windows']
        positive_ion_fragment_mass_windows = self['positive_ion_fragment_mass_windows']
        negative_ion_fragment_mass_windows = self['negative_ion_fragment_mass_windows']
        mass_tolerance_parent_ion = self['mass_tolerance_parent_ion']
        mass_tolerance_fragment_ions = self['mass_tolerance_fragment_ions']
        break_rings = self['break_rings']
        fragmentation_depth = self['fragmentation_depth']

        # Calculate the parent_mass
        precursor_mz = self['precursor_mz']              # FIXME  Get the precursor_mz from the MS2 data
        if precursor_mz == -1:
            precursor_mz = self['fpl_data']['precursor_mz'][:]
        default_charge = self['default_charge']          # FIXME  Is this an input or should we get this from file
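        # Proton mass in Da: mass of a hydrogen atom (1.00782503207 Da) minus the electron mass (5.4857990946e-4 Da).
        # The neutral parent mass is then estimated by removing the mass of the charge-carrying proton(s) from the precursor m/z.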
        proton_mass = 1.00782503207 - 5.4857990946e-4
        parent_mass = precursor_mz - (default_charge * proton_mass)

        # Get the data we need to process
        fpl_data = self['fpl_data']
        fpl_peak_mz = fpl_data['peak_mz']
        fpl_peak_value = fpl_data['peak_value']
        fpl_peak_arrayindex = fpl_data['peak_arrayindex']

        # Get the compound list if we have not read it previously.
        if compound_list is None:
            # TODO: Possible further optimization by reading only on self.mpi_root and then sending the list to all
            compound_list = MIDAS.ReadCompoundFile(metabolite_database)

        # Get the peak_arrayindex with [[x,y, array_offset], ...] values describing the
        # index of the pixel in (x,y) and the offset in the peak_mz and peak_value array
        # where we can find the spectrum that we need to process
        num_spectra = fpl_peak_arrayindex.shape[0]
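        # If no spectrum subset was given, process all spectra and allow parallel execution to be initiated;
        # otherwise process only the given subset in serial (see docstring)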
        if spectrum_indexes is None:
            # Get the complete peak array index data
            spectrum_indexes = np.arange(0, num_spectra)
            enable_parallel = True
        else:
            if isinstance(spectrum_indexes, int):
                spectrum_indexes = np.asarray([spectrum_indexes, ])
            enable_parallel = False

        #############################################################
        # Parallel execution using MPI
        #############################################################
        # We have more than a single core AND we have multiple spectra to process
        if mpi_helper.get_size() > 1 and len(spectrum_indexes) > 1:
            # We were not asked to process a specific data subblock from a parallel process
            # but we need to initiate the parallel processing.
            if enable_parallel:
                # Setup the parallel processing using mpi_helper.parallel_over_axes
                split_axis = [0, ]
                scheduler = mpi_helper.parallel_over_axes(
                    task_function=self.execute_analysis,                    # Execute this function
                    task_function_params={'compound_list': compound_list},  # Reuse the compound_list
                    main_data=spectrum_indexes,                             # Process the spectra independently
                    split_axes=split_axis,                                  # Split along axes
                    main_data_param_name='spectrum_indexes',                # data input param
                    root=self.mpi_root,                                     # The root MPI task
                    schedule=self['schedule'],                              # Parallel scheduling scheme
                    comm=self.mpi_comm)                                     # MPI communicator
                # Execute the analysis in parallel
                result = scheduler.run()
                # Collect the output data to the root rank if requested
                if self['collect']:
                    result = scheduler.collect_data()

                # Compile the data from the parallel execution
                hit_table = np.zeros((0, 0), dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)  # initialize hit_table as empty
                pixel_index = np.zeros((0, 2), dtype='int')
                use_dynamic_schedule = (self['schedule'] == mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'])

                # TODO NEED to update since collect now returns a single list not a list of lists
                if not self['collect'] and (mpi_helper.get_rank() == self.mpi_root and use_dynamic_schedule):
                    # We did not process any data on the root process when using dynamic scheduling
                    # and we did not collect the data to the root either
                    pass
                #elif self['collect'] and mpi_helper.get_rank() == self.mpi_root:
                #    temp_data = [ri[0] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                #    temp_data = [ri[1] for rt in result[0] for ri in rt]
                #    if len(temp_data) > 0:
                #        pixel_index = np.concatenate(tuple(temp_data), axis=0) # axis=-1
                else:
                    temp_data = [ri[0] for ri in result[0]]
                    if len(temp_data) > 0:
                        hit_table = np.concatenate(tuple(temp_data), axis=-1)
                    temp_data = [ri[1] for ri in result[0]]
                    if len(temp_data) > 0:
                        pixel_index = np.concatenate(tuple(temp_data), axis=0)
                return hit_table, pixel_index

        #############################################################
        # Serial processing of the current data block
        #############################################################
        # Initialize the output data structures
        pixel_index = fpl_peak_arrayindex[spectrum_indexes, 0:2]
        if len(pixel_index.shape) == 1:
            pixel_index = pixel_index[np.newaxis, :]
        hit_table = None  # FIXME The initialization of the hit_table is only valid if we assume that all spectra have the same precursor m/z, which may not be the case
        # Iterate through all the pixels we were asked to process in serial
        for current_index, spectrum_index in enumerate(spectrum_indexes):
            # Determine the start and stop index for the m/z and intensity data of the current spectrum
            start = fpl_peak_arrayindex[spectrum_index, 2]
            stop = fpl_peak_arrayindex[(spectrum_index+1), 2] \
                if spectrum_index < (num_spectra-1) \
                else fpl_peak_value.size
            spectrum_length = stop - start
            # Skip empty spectra
            if spectrum_length == 0:
                time_str =  "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " Spectrum not scored."
                print time_str
                continue
            # Load the m/z and intensity values for the current spectrum
            current_peaks_list = np.zeros(shape=(spectrum_length, 3), dtype=float)
            current_peaks_list[:, 0] = fpl_peak_mz[start:stop]
            current_peaks_list[:, 1] = fpl_peak_value[start:stop]

            # Get the parent mass
            current_parent_mass = parent_mass if len(parent_mass) == 1 else parent_mass[spectrum_index]

            start_time = time.time()
            # Call MIDAS to score the current spectrum against all compounds in the database
            current_hits = MIDAS.scoring_C.score_main(
                Compound_list=compound_list,
                bBreakRing=break_rings,
                dCurrentPrecursor_type=precursor_type,
                dCurrentParentMass=current_parent_mass,
                current_peaks_list=current_peaks_list,
                iParentMassWindow_list=parent_mass_windows,
                dMass_Tolerance_Parent_Ion=mass_tolerance_parent_ion,
                dMass_Tolerance_Fragment_Ions=mass_tolerance_fragment_ions,
                iFragmentation_Depth=fragmentation_depth,
                iPositive_Ion_Fragment_Mass_Windows_list=positive_ion_fragment_mass_windows,
                iNegative_Ion_Fragment_Mass_Windows_list=negative_ion_fragment_mass_windows,
                top_n=None)

            end_time = time.time()
            execution_time = end_time - start_time
            time_str =  "rank : " + str(mpi_helper.get_rank()) + " : pixel_index : " + str(fpl_peak_arrayindex[spectrum_index, 0:2]) + " : time in s : " + str(execution_time)
            time_str += " : num hits : " + str(current_hits.shape[0])
            print time_str
            sys.stdout.flush()

            # Initialize the hit_table if necessary
            if hit_table is None:
                # If our compound database does not contain any related compounds then just finish
                if current_hits.shape[0] == 0:
                    # Initialize the results as empty and finish as there is nothing to do
                    hit_table = np.zeros(shape=(pixel_index.shape[0], 0),
                                         dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)  # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
                    continue
                # If our compound database contains at least one relevant compound then check all spectra
                else:
                    # Create the data structure to store all results
                    hit_table = np.zeros(shape=(pixel_index.shape[0], current_hits.shape[0]),
                                         dtype=current_hits.dtype)  # FIXME the number of hits may be different for different spectra if we have varying precursor m/z
            # Save the hits for the current pixel
            hit_table[current_index] = current_hits

        if hit_table is None:
            hit_table = np.zeros(shape=(pixel_index.shape[0], 0),
                                 dtype=MIDAS.scoring_C.HIT_TABLE_DTYPE)

        # Return the hit_table and the index of the pixel each hit_table applies to
        return hit_table, pixel_index
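
Most of the examples above follow the same MPI pattern: the public entry point, when called without an index subset, sets up mpi_helper.parallel_over_axes to fan the work out across ranks, and when the scheduler calls it back with a pre-assigned subset it processes only that subset in serial. Below is a minimal sketch of that pattern, assuming the omsi.shared.mpi_helper module and the parallel_over_axes interface exactly as they appear in the snippets above; the names process_items, item_indexes, and data are illustrative only.

import numpy as np
from omsi.shared import mpi_helper   # assumed import path for the mpi_helper module used above


def process_items(item_indexes=None, data=None):
    """
    Illustrative stand-in for the execute_analysis methods above.

    :param item_indexes: Optional subset of indices to process in serial (set by the scheduler).
    :param data: 1D numpy array with the items to process.
    """
    if item_indexes is None:
        item_indexes = np.arange(data.shape[0])
        initiate_parallel = True        # Called by the user, so we may start the parallel run
    else:
        initiate_parallel = False       # Called by the scheduler with a pre-assigned subset

    # Fan the work out over all MPI ranks, mirroring the execute_analysis examples
    if mpi_helper.get_size() > 1 and initiate_parallel and len(item_indexes) > 1:
        scheduler = mpi_helper.parallel_over_axes(
            task_function=process_items,                 # Re-enter this function on each rank
            task_function_params={'data': data},         # Shared, non-split inputs
            main_data=item_indexes,                      # The data that is split across ranks
            split_axes=[0, ],                            # Split along the first axis
            main_data_param_name='item_indexes',         # Name of the split argument
            root=0,                                      # Root MPI rank
            schedule=mpi_helper.parallel_over_axes.SCHEDULES['DYNAMIC'],
            comm=mpi_helper.get_comm_world())
        result = scheduler.run()
        # result[0] holds the per-task outputs; compile or collect them as in the examples above
        return result

    # Serial processing of the assigned subset
    return [data[index] for index in item_indexes]

As in the examples, inputs that are expensive to load (such as the file_lookup_table or compound_list) are passed through task_function_params so that each rank does not have to re-read them.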