Beispiel #1
0
def extract_roi(file_names,
                out_dir,
                pattern,
                mzml_path,
                ps,
                roi_mz_tol=10,
                roi_min_length=2,
                roi_min_intensity=1.75E5,
                roi_start_rt=0,
                roi_stop_rt=1440):
    """
    Extract ROIs from each mzML file, convert them to chemicals and save them.

    :param file_names: the mzML file names to process
    :param out_dir: directory in which the extracted chemicals are saved
    :param pattern: %-style pattern for the output file name; it is formatted
    with the integer found in the third '_'-separated token of the file's
    base name
    :param mzml_path: directory that is joined in front of each file name
    :param ps: object handed to `RoiToChemicalCreator` together with the ROIs
    :param roi_mz_tol: m/z tolerance passed to `make_roi` (in ppm)
    :param roi_min_length: `min_length` passed to `make_roi`
    :param roi_min_intensity: `min_intensity` passed to `make_roi`
    :param roi_start_rt: `start_rt` passed to `make_roi`
    :param roi_stop_rt: `stop_rt` passed to `make_roi`
    :return: None
    """
    for name in file_names:
        # only the first element returned by make_roi is used
        good_roi, _ = make_roi(os.path.join(mzml_path, name),
                               mz_tol=roi_mz_tol,
                               mz_units='ppm',
                               min_length=roi_min_length,
                               min_intensity=roi_min_intensity,
                               start_rt=roi_start_rt,
                               stop_rt=roi_stop_rt)

        # convert the ROIs into chemicals
        chemicals = RoiToChemicalCreator(ps, good_roi).chemicals

        # the output name is derived from the number embedded in the file name
        file_number = int(os.path.basename(name).split('_')[2])
        save_obj(chemicals, os.path.join(out_dir, pattern % file_number))
Beispiel #2
0
def get_spectral_feature_database(ds,
                                  filename,
                                  min_ms1_intensity,
                                  min_ms2_intensity,
                                  min_rt,
                                  max_rt,
                                  bandwidth_mz_intensity_rt,
                                  bandwidth_n_peaks,
                                  out_file=None):
    """
    Build a `PeakSampler` from the .mzML files loaded into a `DataSource`,
    optionally saving it to disk.

    :param ds: the `DataSource` object holding the loaded .mzML files.
    :param filename: the .mzML file to use. If None, all files loaded in `ds` are used.
    :param min_ms1_intensity: minimum MS1 intensity for a data point to be used when training the KDEs.
    :param min_ms2_intensity: minimum MS2 intensity for a data point to be used when training the KDEs.
    :param min_rt: minimum RT for a data point to be used when training the KDEs.
    :param max_rt: maximum RT for a data point to be used when training the KDEs.
    :param bandwidth_mz_intensity_rt: kernel bandwidth of the KDEs for (mz, RT, intensity) values.
    :param bandwidth_n_peaks: kernel bandwidth of the KDEs for the number of peaks per scan.
    :param out_file: if not None, the path where the resulting `PeakSampler` is saved.
    :return: the `PeakSampler` object, usable to draw samples for simulation.
    """
    # NOTE(review): the positional `False` is a PeakSampler option whose
    # meaning is not visible here -- confirm against PeakSampler's signature.
    sampler = PeakSampler(ds, min_rt, max_rt, min_ms1_intensity,
                          min_ms2_intensity, filename, False,
                          bandwidth_mz_intensity_rt, bandwidth_n_peaks)
    if out_file is None:
        return sampler
    save_obj(sampler, out_file)
    return sampler
Beispiel #3
0
    def __init__(self,
                 sequence_manager,
                 controller_method,
                 mass_spec_param_dict,
                 dataset_file,
                 variable_params_dict,
                 base_params_dict,
                 mzml_file=None,
                 roi_params=RoiParams(min_intensity=10, min_length=5),
                 ps=None,
                 parallel=True):
        """
        Set up the experiment, deriving a dataset from an mzML file if no
        dataset file is given.

        :param sequence_manager: the sequence manager; its
        `controller_schedule` is replaced with the generated schedule
        :param controller_method: stored on the instance
        :param mass_spec_param_dict: stored on the instance
        :param dataset_file: path to an existing dataset. If None, a dataset
        is extracted from `mzml_file` and saved in `sequence_manager.base_dir`
        :param variable_params_dict: stored on the instance
        :param base_params_dict: stored on the instance
        :param mzml_file: mzML file used when `dataset_file` is None
        :param roi_params: ROI parameters passed to `ChemicalMixtureFromMZML`
        :param ps: forwarded to the parent constructor
        :param parallel: forwarded to the parent constructor
        """
        self.sequence_manager = sequence_manager
        self.parallel = parallel
        self.controller_method = controller_method
        self.mass_spec_param_dict = mass_spec_param_dict
        self.dataset_file = dataset_file
        self.mzml_file = mzml_file

        if self.dataset_file is None:
            # no dataset supplied: build one from the mzML file and save it
            # in the sequence manager's base directory
            mixture = ChemicalMixtureFromMZML(self.mzml_file,
                                              roi_params=roi_params)
            chemicals = mixture.sample(None, 2)
            pickle_path = os.path.join(sequence_manager.base_dir,
                                       Path(mzml_file).stem + '.p')
            save_obj(chemicals, pickle_path)
            self.dataset_file = pickle_path

            # peak picking is only done when evaluation methods are
            # configured and no picked-peaks file has been set yet
            manager = self.sequence_manager
            needs_peak_picking = (manager.ms1_picked_peaks_file is None
                                  and len(manager.evaluation_methods) > 0)
            if needs_peak_picking:
                manager.ms1_picked_peaks_file = manager.pick_peaks(
                    self.mzml_file, None, 1)

        self.variable_params_dict = variable_params_dict
        self.base_params_dict = base_params_dict
        # the schedule is generated from the attributes set above, before
        # the parent constructor runs
        schedule = self._generate_controller_schedule()
        sequence_manager.controller_schedule = schedule
        super().__init__(sequence_manager, self.parallel, ps=ps)
Beispiel #4
0
 def run_experiment(self, idx):
     """
     Run the experiment scheduled at position `idx`, unless its mzML output
     already exists in the base directory.

     :param idx: index into the controller schedule
     :return: a tuple of (path to the mzML file, controller name)
     """
     controller_name = self.controller_schedule['Sample ID'][idx]
     mzml_name = controller_name + '.mzML'
     finished = [
         os.path.basename(path)
         for path in glob.glob(os.path.join(self.base_dir, '*.mzML'))
     ]
     if mzml_name in finished:
         logger.info('Experiment already completed. Skipping...')
     else:
         controller, ms_params = super().run_experiment(idx)
         # log the schedule row that is about to be run
         logger.info(self.controller_schedule.iloc[[idx]].to_dict())
         method = self.controller_schedule['Controller Method'][idx]
         dataset_path = self.controller_schedule['Dataset'][idx]
         if method is not None and dataset_path is not None:
             # load the data and set up the mass spectrometer
             dataset = load_obj(dataset_path)
             mass_spec = IndependentMassSpectrometer(
                 ms_params['ionisation_mode'], dataset)
             # run the sample and write out the results
             rt_start = self.rt_range[0][0]
             rt_stop = self.rt_range[0][1]
             env = Environment(mass_spec,
                               controller,
                               rt_start,
                               rt_stop,
                               progress_bar=self.progress_bar)
             env.run()
             env.write_mzML(self.base_dir, mzml_name)
             if self.write_env:
                 controller_path = os.path.join(self.base_dir,
                                                controller_name + '.p')
                 save_obj(controller, controller_path)
     return os.path.join(self.base_dir, mzml_name), controller_name
Beispiel #5
0
 def write_debug_info(self, out_dir, out_file, debug_info):
     """
     Save debugging information to an output file via `save_obj`.

     :param out_dir: output directory
     :param out_file: output filename; '.p' is appended to it
     :param debug_info: the debugging information to save
     :return: None
     """
     target_path = self._get_out_file(out_dir, out_file + '.p')
     logger.debug('Writing debug info to %s' % target_path)
     save_obj(debug_info, target_path)
     logger.debug('debug info successfully written!')
Beispiel #6
0
 def add_dataset_files(self, sequence_manager, mzml_file_list):
     """
     Extract a dataset from the mzML file associated with each schedule
     entry, save it, and record its path in the 'Dataset' column of the
     controller schedule.

     :param sequence_manager: the sequence manager whose
     `controller_schedule` is updated in place; its `schedule_idx` maps each
     schedule row to an entry of `mzml_file_list`
     :param mzml_file_list: list of mzML file paths; None entries are skipped
     :return: the same `sequence_manager` object
     """
     # NOTE(review): controller_schedule is assumed to be a pandas DataFrame
     # (it is used with .iloc elsewhere) -- confirm.
     schedule = sequence_manager.controller_schedule
     for i in range(len(schedule['Dataset'])):
         mzml_file = mzml_file_list[sequence_manager.schedule_idx[i]]
         if mzml_file is None:
             continue
         cm = ChemicalMixtureFromMZML(mzml_file,
                                      roi_params=self.roi_params)
         dataset = cm.sample(None, 2)
         dataset_name = os.path.join(sequence_manager.base_dir,
                                     Path(mzml_file).stem + '.p')
         save_obj(dataset, dataset_name)
         # Use a single .loc assignment: chained indexing
         # (schedule['Dataset'][i] = ...) may write to a temporary copy and
         # leave the schedule itself unchanged.
         schedule.loc[i, 'Dataset'] = dataset_name
     return sequence_manager
Beispiel #7
0
def run_experiment(param):
    """
    Runs a Top-N experiment, unless both of its output files already exist.

    :param param: the experimental parameters, a dictionary with keys
    'analysis_name', 'mzml_out', 'pickle_out', 'N', 'rt_tol', 'peak_sampler',
    'ionisation_mode', 'data', 'isolation_width', 'mz_tol',
    'min_ms1_intensity', 'min_rt', 'max_rt' and 'pbar'; 'mzml_path' and
    'fragfiles' are also read when 'peak_sampler' is None
    :return: the analysis name that has been successfully ran, or None if
    the experiment was skipped because its outputs already exist
    """
    analysis_name = param['analysis_name']
    mzml_out = param['mzml_out']
    pickle_out = param['pickle_out']
    N = param['N']
    rt_tol = param['rt_tol']

    if os.path.isfile(mzml_out) and os.path.isfile(pickle_out):
        logger.debug('Skipping %s', analysis_name)
        return None

    logger.debug('Processing %s', analysis_name)
    peak_sampler = param['peak_sampler']
    if peak_sampler is None:  # extract density from the fragmentation file
        mzml_path = param['mzml_path']
        fragfile = param['fragfiles'][(N, rt_tol)]
        # NOTE(review): the resulting peak_sampler is not used below; kept
        # in case get_peak_sampler has side effects callers rely on.
        peak_sampler = get_peak_sampler(mzml_path, fragfile,
                                        param['min_rt'], param['max_rt'])

    mass_spec = IndependentMassSpectrometer(param['ionisation_mode'],
                                            param['data'])
    controller = TopNController(param['ionisation_mode'], param['N'],
                                param['isolation_width'], param['mz_tol'],
                                param['rt_tol'],
                                param['min_ms1_intensity'])
    # create an environment to run both the mass spec and controller
    env = Environment(mass_spec,
                      controller,
                      param['min_rt'],
                      param['max_rt'],
                      progress_bar=param['pbar'])
    set_log_level_warning()
    try:
        env.run()
    finally:
        # restore the log level even if the run fails
        set_log_level_debug()
    env.write_mzML(None, mzml_out)
    save_obj(controller, pickle_out)
    return analysis_name
Beispiel #8
0
def extract_roi(file_names,
                out_dir,
                pattern,
                mzml_path,
                param_dict=DEFAULT_MZML_CHEMICAL_CREATOR_PARAMS):
    """
    Extract ROIs from every mzML file listed in file_names, turn them into
    Chemical objects and save the result for each file.

    :param file_names: a list of mzML file names
    :param out_dir: output directory for the saved chemicals. If None, each
    result is saved next to its mzML file, with the extension replaced
    by '.p'
    :param pattern: %-style pattern for the output file name, used only when
    out_dir is given; it is formatted with the integer found in the third
    '_'-separated token of the file's base name
    :param mzml_path: input directory containing the mzML files in
    file_names. If None, the entries of file_names are used as they are
    :param param_dict: dictionary of keyword arguments for `RoiParams`
    :return: a list of extracted Chemicals, one for each mzML file
    """
    datasets = []
    for name in file_names:
        # prefix the file name with mzml_path when one is provided
        if mzml_path is None:
            mzml_file = name
        else:
            mzml_file = os.path.join(mzml_path, name)

        mixture = ChemicalMixtureFromMZML(mzml_file,
                                          roi_params=RoiParams(**param_dict))
        chemicals = mixture.sample(None, 2)
        datasets.append(chemicals)

        # decide where the extracted chemicals are saved
        if out_dir is None:
            target = os.path.splitext(mzml_file)[0] + '.p'
        else:
            file_number = int(os.path.basename(name).split('_')[2])
            target = os.path.join(out_dir, pattern % file_number)
        save_obj(chemicals, target)

    return datasets
Beispiel #9
0
    def __init__(self, original_dataset, n_samples, classes, intensity_noise_sd,
                 change_probabilities, change_differences_means, change_differences_sds, dropout_probabilities=None,
                 dropout_numbers=None, experimental_classes=None, experimental_probabilitities=None,
                 experimental_sds=None, save_location=None):
        """
        Create sum(n_samples) modified copies of original_dataset, one per
        sample, and store them in self.samples.

        For each sample the chemicals whose status for the sample's class is
        "missing" are dropped, and the max_intensity of every other chemical
        is replaced by a value computed by the class's helper methods.

        :param original_dataset: the chemicals that are deep-copied for each sample
        :param n_samples: number of samples to create for each entry of classes
        :param classes: the class labels
        :param intensity_noise_sd: stored on the instance
        :param change_probabilities: stored on the instance
        :param change_differences_means: stored on the instance
        :param change_differences_sds: stored on the instance
        :param dropout_probabilities: stored on the instance
        :param dropout_numbers: stored on the instance
        :param experimental_classes: if not None, experimental statuses and effects are also generated
        :param experimental_probabilitities: stored on the instance
        :param experimental_sds: stored on the instance
        :param save_location: if not None, each sample is saved there as 'sample_<index>.p'
        """
        self.original_dataset = original_dataset
        self.n_samples = n_samples
        self.classes = classes
        self.intensity_noise_sd = intensity_noise_sd
        self.change_probabilities = change_probabilities
        self.change_differences_means = change_differences_means
        self.change_differences_sds = change_differences_sds
        self.dropout_probabilities = dropout_probabilities
        self.dropout_numbers = dropout_numbers
        self.experimental_classes = experimental_classes
        self.experimental_probabilitities = experimental_probabilitities
        self.experimental_sds = experimental_sds
        self.save_location = save_location

        # one class label per sample, grouped in the order of self.classes
        self.sample_classes = []
        for class_pos, class_label in enumerate(self.classes):
            self.sample_classes.extend([class_label for _ in range(n_samples[class_pos])])
        self.chemical_statuses = self._get_chemical_statuses()
        self.chemical_differences_from_class1 = self._get_chemical_differences_from_class1()
        if self.experimental_classes is not None:
            self.sample_experimental_statuses = self._get_experimental_statuses()
            self.experimental_effects = self._get_experimental_effects()
        self.logger.debug("Classes, Statuses and Differences defined.")

        self.samples = []
        for sample_pos in range(sum(self.n_samples)):
            self.logger.debug("Dataset {} of {} created.".format(sample_pos + 1, sum(self.n_samples)))
            sample = copy.deepcopy(self.original_dataset)
            # select the row of chemical_statuses belonging to this sample's class
            class_selector = np.where(np.array(self.classes) == self.sample_classes[sample_pos])
            for chem_pos in range(len(sample)):
                status = np.array(self.chemical_statuses)[class_selector][0][chem_pos]
                if status == "missing":
                    continue
                # class effect first, then experimental effect, then noise
                base_intensity = sample[chem_pos].max_intensity
                class_intensity = self._get_intensity(base_intensity, class_selector, chem_pos)
                adjusted = self._get_experimental_factor_effect(class_intensity, sample_pos, chem_pos)
                noisy = self._get_noisy_intensity(adjusted)
                sample[chem_pos].max_intensity = noisy.tolist()[0]
            # drop the chemicals that are missing for this class
            kept = np.where((np.array(self.chemical_statuses)[class_selector][0]) != "missing")
            sample = np.array(sample)[kept].tolist()
            if self.save_location is not None:
                save_obj(sample, Path(self.save_location, 'sample_%d.p' % sample_pos))
            self.samples.append(sample)
Beispiel #10
0
 def save(self, outname):
     """
     Save the controller's scans to a file via `save_obj`.

     :param outname: the output file name
     :return: None
     """
     # the saved object is a dictionary so that more entries can be added
     payload = {}
     payload['scans'] = self.controller.scans
     # etc
     save_obj(payload, outname)