def extract_roi(file_names, out_dir, pattern, mzml_path, ps, roi_mz_tol=10, roi_min_length=2,
                roi_min_intensity=1.75E5, roi_start_rt=0, roi_stop_rt=1440):
    """
    Extract regions of interest (ROI) from each mzML file and pickle the resulting chemicals.

    :param file_names: names of the mzML files to process, relative to `mzml_path`
    :param out_dir: directory in which the pickled chemicals are written
    :param pattern: %-format pattern for the output file name. It is filled with the integer
        found in the third underscore-separated field of each input file's base name.
    :param mzml_path: directory containing the files listed in `file_names`
    :param ps: object handed to `RoiToChemicalCreator` together with the extracted ROI
    :param roi_mz_tol: m/z tolerance in ppm, forwarded to `make_roi`
    :param roi_min_length: `min_length` forwarded to `make_roi`
    :param roi_min_intensity: `min_intensity` forwarded to `make_roi`
    :param roi_start_rt: `start_rt` forwarded to `make_roi`
    :param roi_stop_rt: `stop_rt` forwarded to `make_roi`
    :return: None
    """
    # The ROI settings are identical for every file, so gather them once.
    roi_options = dict(mz_tol=roi_mz_tol, mz_units='ppm', min_length=roi_min_length,
                       min_intensity=roi_min_intensity, start_rt=roi_start_rt, stop_rt=roi_stop_rt)

    for file_name in file_names:
        # Only the first element returned by make_roi is turned into chemicals.
        good_roi, _ = make_roi(os.path.join(mzml_path, file_name), **roi_options)
        chemicals = RoiToChemicalCreator(ps, good_roi).chemicals

        # Name the output after the number embedded in the input file name.
        file_number = int(os.path.basename(file_name).split('_')[2])
        save_obj(chemicals, os.path.join(out_dir, pattern % file_number))
def get_spectral_feature_database(ds, filename, min_ms1_intensity, min_ms2_intensity, min_rt, max_rt,
                                  bandwidth_mz_intensity_rt, bandwidth_n_peaks, out_file=None):
    """
    Build a `PeakSampler` from the .mzML files loaded into a `DataSource`, optionally pickling it.

    :param ds: the `DataSource` object that contains the loaded .mzML files
    :param filename: a particular .mzML file to use. If None, all files loaded in `ds` are used.
    :param min_ms1_intensity: minimum MS1 intensity for a data point to be used when training the KDEs
    :param min_ms2_intensity: minimum MS2 intensity for a data point to be used when training the KDEs
    :param min_rt: minimum RT for a data point to be used when training the KDEs
    :param max_rt: maximum RT for a data point to be used when training the KDEs
    :param bandwidth_mz_intensity_rt: kernel bandwidth for the (mz, RT, intensity) KDEs
    :param bandwidth_n_peaks: kernel bandwidth for the number-of-peaks-per-scan KDE
    :param out_file: if not None, the trained `PeakSampler` is saved to this path
    :return: a `PeakSampler` object that can be used to draw samples for simulation
    """
    sampler = PeakSampler(ds, min_rt, max_rt, min_ms1_intensity, min_ms2_intensity, filename, False,
                          bandwidth_mz_intensity_rt, bandwidth_n_peaks)
    if out_file is None:
        return sampler

    save_obj(sampler, out_file)
    return sampler
def __init__(self, sequence_manager, controller_method, mass_spec_param_dict, dataset_file,
             variable_params_dict, base_params_dict, mzml_file=None,
             roi_params=RoiParams(min_intensity=10, min_length=5), ps=None, parallel=True):
    """
    Set up the experiment: make sure a dataset pickle exists, optionally pick MS1 peaks,
    generate the controller schedule and hand over to the parent constructor.

    :param sequence_manager: sequence manager; its `base_dir`, `ms1_picked_peaks_file`,
        `evaluation_methods` and `pick_peaks` are used here and its `controller_schedule` is replaced
    :param controller_method: stored on the instance
    :param mass_spec_param_dict: stored on the instance
    :param dataset_file: path to an existing dataset pickle. If None, a dataset is extracted
        from `mzml_file` and saved under `sequence_manager.base_dir`.
    :param variable_params_dict: stored on the instance before the schedule is generated
    :param base_params_dict: stored on the instance before the schedule is generated
    :param mzml_file: mzML file used for dataset extraction and peak picking
    :param roi_params: ROI parameters passed to `ChemicalMixtureFromMZML`
    :param ps: forwarded to the parent constructor
    :param parallel: forwarded to the parent constructor
    """
    self.sequence_manager = sequence_manager
    self.parallel = parallel
    self.controller_method = controller_method
    self.mass_spec_param_dict = mass_spec_param_dict
    self.mzml_file = mzml_file
    self.dataset_file = dataset_file

    if dataset_file is None:
        # No dataset supplied: derive one from the mzML file and pickle it next to the
        # other outputs, named after the mzML file's stem.
        mixture = ChemicalMixtureFromMZML(mzml_file, roi_params=roi_params)
        chemicals = mixture.sample(None, 2)
        pickle_path = os.path.join(sequence_manager.base_dir, Path(mzml_file).stem + '.p')
        save_obj(chemicals, pickle_path)
        self.dataset_file = pickle_path

    # Peak picking is only performed when evaluation methods are configured and no
    # picked-peaks file has been set yet.
    manager = self.sequence_manager
    if manager.ms1_picked_peaks_file is None and len(manager.evaluation_methods) > 0:
        manager.ms1_picked_peaks_file = manager.pick_peaks(self.mzml_file, None, 1)

    self.variable_params_dict = variable_params_dict
    self.base_params_dict = base_params_dict
    sequence_manager.controller_schedule = self._generate_controller_schedule()
    super().__init__(sequence_manager, self.parallel, ps=ps)
def run_experiment(self, idx):
    """
    Run the scheduled experiment at position `idx`, unless its mzML output already exists.

    :param idx: index into `self.controller_schedule`
    :return: a tuple `(mzml_file, controller_name)`, where `mzml_file` is the path of the
        experiment's mzML output under `self.base_dir`
    """
    controller_name = self.controller_schedule['Sample ID'][idx]
    found_paths = glob.glob(os.path.join(self.base_dir, '*.mzML'))
    mzml_name = controller_name + '.mzML'
    existing_names = [os.path.basename(path) for path in found_paths]

    if mzml_name in existing_names:
        logger.info('Experiment already completed. Skipping...')
    else:
        controller, ms_params = super().run_experiment(idx)

        # load data and set up MS
        logger.info(self.controller_schedule.iloc[[idx]].to_dict())
        method = self.controller_schedule['Controller Method'][idx]
        dataset_path = self.controller_schedule['Dataset'][idx]
        if method is not None and dataset_path is not None:
            chemicals = load_obj(dataset_path)
            mass_spec = IndependentMassSpectrometer(ms_params['ionisation_mode'], chemicals)

            # Run sample
            rt_start, rt_stop = self.rt_range[0][0], self.rt_range[0][1]
            env = Environment(mass_spec, controller, rt_start, rt_stop,
                              progress_bar=self.progress_bar)
            env.run()
            env.write_mzML(self.base_dir, mzml_name)
            if self.write_env:
                save_obj(controller, os.path.join(self.base_dir, controller_name + '.p'))

    return os.path.join(self.base_dir, mzml_name), controller_name
def write_debug_info(self, out_dir, out_file, debug_info):
    """
    Pickle debugging information to an output file.

    :param out_dir: output directory
    :param out_file: output filename without extension; '.p' is appended
    :param debug_info: the object to save
    :return: None
    """
    # The final path is resolved by the instance's own helper.
    target_path = self._get_out_file(out_dir, out_file + '.p')

    logger.debug('Writing debug info to %s' % target_path)
    save_obj(debug_info, target_path)
    logger.debug('debug info successfully written!')
def add_dataset_files(self, sequence_manager, mzml_file_list):
    """
    Extract a dataset from the mzML file scheduled for each row of the controller schedule,
    pickle it, and record the pickle's path in the schedule's 'Dataset' column.

    :param sequence_manager: object providing `controller_schedule`, `schedule_idx` and `base_dir`
    :param mzml_file_list: mzML paths, indexed by the values in `sequence_manager.schedule_idx`.
        A None entry leaves the corresponding schedule row untouched.
    :return: the same `sequence_manager`, with its schedule updated in place
    """
    schedule = sequence_manager.controller_schedule
    for row in range(len(schedule['Dataset'])):
        mzml_file = mzml_file_list[sequence_manager.schedule_idx[row]]
        if mzml_file is None:
            continue

        mixture = ChemicalMixtureFromMZML(mzml_file, roi_params=self.roi_params)
        dataset = mixture.sample(None, 2)
        dataset_name = os.path.join(sequence_manager.base_dir, Path(mzml_file).stem + '.p')
        save_obj(dataset, dataset_name)

        # Write through a single .loc call instead of schedule['Dataset'][row] = ...:
        # the chained form assigns into an intermediate Series, which pandas does not
        # guarantee to propagate back to the DataFrame (and never does under Copy-on-Write).
        # NOTE(review): assumes controller_schedule is a pandas DataFrame with the
        # default integer index - confirm.
        schedule.loc[row, 'Dataset'] = dataset_name
    return sequence_manager
def run_experiment(param):
    """
    Runs a Top-N experiment

    :param param: the experimental parameters, as a mapping. The keys read here are
        'analysis_name', 'mzml_out', 'pickle_out', 'N', 'rt_tol', 'peak_sampler', 'ionisation_mode',
        'data', 'isolation_width', 'mz_tol', 'min_ms1_intensity', 'min_rt', 'max_rt' and 'pbar',
        plus 'mzml_path' and 'fragfiles' when 'peak_sampler' is None.
    :return: the analysis name that has been successfully ran
    """
    analysis_name = param['analysis_name']
    mzml_out = param['mzml_out']
    pickle_out = param['pickle_out']
    N = param['N']
    rt_tol = param['rt_tol']

    outputs_exist = os.path.isfile(mzml_out) and os.path.isfile(pickle_out)
    if outputs_exist:
        logger.debug('Skipping %s' % (analysis_name))
        # NOTE(review): nothing is returned for a skipped analysis - confirm against callers.
        return None

    logger.debug('Processing %s' % (analysis_name))
    peak_sampler = param['peak_sampler']
    if peak_sampler is None:
        # extract density from the fragmentation file
        mzml_path = param['mzml_path']
        fragfile = param['fragfiles'][(N, rt_tol)]
        # NOTE(review): the sampler obtained here is not passed to anything below.
        peak_sampler = get_peak_sampler(mzml_path, fragfile, param['min_rt'], param['max_rt'])

    mass_spec = IndependentMassSpectrometer(param['ionisation_mode'], param['data'])
    controller = TopNController(param['ionisation_mode'], N, param['isolation_width'],
                                param['mz_tol'], rt_tol, param['min_ms1_intensity'])

    # create an environment to run both the mass spec and controller
    env = Environment(mass_spec, controller, param['min_rt'], param['max_rt'],
                      progress_bar=param['pbar'])

    # Logging is switched to warning level for the duration of the run only.
    set_log_level_warning()
    env.run()
    set_log_level_debug()

    env.write_mzML(None, mzml_out)
    save_obj(controller, pickle_out)
    return analysis_name
def extract_roi(file_names, out_dir, pattern, mzml_path, param_dict=DEFAULT_MZML_CHEMICAL_CREATOR_PARAMS):
    """
    Extract ROI for all mzML files listed in file_names, and turn them into Chemical objects

    :param file_names: a list of mzML file names
    :param out_dir: output directory to store pickled chemicals. If None, each pickle is written
        next to its mzML file, with the extension replaced by '.p'.
    :param pattern: %-format pattern for the output file name, used only when `out_dir` is given.
        It is filled with the integer found in the third underscore-separated field of the
        input file's base name.
    :param mzml_path: input directory containing all the mzML files in file_names. If None, the
        entries of `file_names` are used as paths directly.
    :param param_dict: dictionary of keyword arguments used to build the `RoiParams`
    :return: a list of extracted Chemicals, one for each mzML file
    """
    datasets = []
    for file_name in file_names:
        # if mzml_path is provided, use that as the front part of the filename
        mzml_file = file_name if mzml_path is None else os.path.join(mzml_path, file_name)

        mixture = ChemicalMixtureFromMZML(mzml_file, roi_params=RoiParams(**param_dict))
        chemicals = mixture.sample(None, 2)
        datasets.append(chemicals)

        # decide where the extracted chemicals are saved
        if out_dir is None:
            save_path = os.path.splitext(mzml_file)[0] + '.p'
        else:
            file_number = int(os.path.basename(file_name).split('_')[2])
            save_path = os.path.join(out_dir, pattern % file_number)
        save_obj(chemicals, save_path)
    return datasets
def __init__(self, original_dataset, n_samples, classes, intensity_noise_sd, change_probabilities, change_differences_means, change_differences_sds, dropout_probabilities=None, dropout_numbers=None, experimental_classes=None, experimental_probabilitities=None, experimental_sds=None, save_location=None):
    """
    Create one modified copy of `original_dataset` per sample and collect them in `self.samples`.

    For every sample the dataset is deep-copied, the `max_intensity` of each chemical not
    flagged "missing" for the sample's class is recomputed through the instance's `_get_*`
    helpers, and the "missing" chemicals are dropped.

    :param original_dataset: sequence of chemicals; each item must expose `max_intensity`
    :param n_samples: number of samples per class, indexed in parallel with `classes`
    :param classes: the class labels
    :param intensity_noise_sd: stored on the instance (not read directly here)
    :param change_probabilities: stored on the instance (not read directly here)
    :param change_differences_means: stored on the instance (not read directly here)
    :param change_differences_sds: stored on the instance (not read directly here)
    :param dropout_probabilities: stored on the instance (not read directly here)
    :param dropout_numbers: stored on the instance (not read directly here)
    :param experimental_classes: if not None, experimental statuses and effects are also computed
    :param experimental_probabilitities: stored on the instance (not read directly here)
    :param experimental_sds: stored on the instance (not read directly here)
    :param save_location: if not None, each sample is saved there as 'sample_<index>.p'
    """
    # NOTE(review): the parameters marked "not read directly here" are presumably consumed
    # by the _get_* helpers called below - confirm.
    self.original_dataset = original_dataset
    self.n_samples = n_samples
    self.classes = classes
    self.intensity_noise_sd = intensity_noise_sd
    self.change_probabilities = change_probabilities
    self.change_differences_means = change_differences_means
    self.change_differences_sds = change_differences_sds
    self.dropout_probabilities = dropout_probabilities
    self.dropout_numbers = dropout_numbers
    self.experimental_classes = experimental_classes
    self.experimental_probabilitities = experimental_probabilitities
    self.experimental_sds = experimental_sds
    self.save_location = save_location
    # Expand the class labels so that each label appears once per sample of that class.
    self.sample_classes = []
    for index_classes in range(len(self.classes)):
        self.sample_classes.extend([self.classes[index_classes] for i in range(n_samples[index_classes])])
    self.chemical_statuses = self._get_chemical_statuses()
    self.chemical_differences_from_class1 = self._get_chemical_differences_from_class1()
    if self.experimental_classes is not None:
        self.sample_experimental_statuses = self._get_experimental_statuses()
        self.experimental_effects = self._get_experimental_effects()
    self.logger.debug("Classes, Statuses and Differences defined.")
    self.samples = []
    for index_sample in range(sum(self.n_samples)):
        # This message is emitted at the start of the iteration, before the sample is built.
        self.logger.debug("Dataset {} of {} created.".format(index_sample + 1, sum(self.n_samples)))
        # Work on a deep copy so the original dataset is left unmodified.
        new_sample = copy.deepcopy(self.original_dataset)
        # Position(s) in self.classes matching this sample's class label (np.where result).
        which_class = np.where(np.array(self.classes) == self.sample_classes[index_sample])
        for index_chemical in range(len(new_sample)):
            # Chemicals flagged "missing" for this class keep their intensity; they are removed below.
            if not np.array(self.chemical_statuses)[which_class][0][index_chemical] == "missing":
                original_intensity = new_sample[index_chemical].max_intensity
                intensity = self._get_intensity(original_intensity, which_class, index_chemical)
                adjusted_intensity = self._get_experimental_factor_effect(intensity, index_sample, index_chemical)
                noisy_adjusted_intensity = self._get_noisy_intensity(adjusted_intensity)
                # The helper's result supports .tolist(); its first element is stored.
                new_sample[index_chemical].max_intensity = noisy_adjusted_intensity.tolist()[0]
        # Keep only the chemicals whose status for this class is not "missing".
        chemicals_to_keep = np.where((np.array(self.chemical_statuses)[which_class][0]) != "missing")
        new_sample = np.array(new_sample)[chemicals_to_keep].tolist()
        if self.save_location is not None:
            # File names use the zero-based sample index.
            save_obj(new_sample, Path(self.save_location, 'sample_%d.p' % index_sample))
        self.samples.append(new_sample)
def save(self, outname):
    """
    Save the controller's scans to `outname`.

    :param outname: destination passed to `save_obj`
    :return: None
    """
    # Only the scans are stored at present; further entries can be added to this dict.
    save_obj({'scans': self.controller.scans}, outname)