def get_chemicals(mzML_file, mz_tol, min_ms1_intensity, start_rt, stop_rt, min_length=1):
    """
    Extract ROIs from an mzML file and convert the retained ones into chemicals.

    :param mzML_file: input mzML file
    :param mz_tol: mz tolerance for ROI extraction
    :param min_ms1_intensity: an ROI is kept only if at least one of its
        points is strictly above this threshold
    :param start_rt: start RT to extract ROI
    :param stop_rt: end RT to extract ROI
    :param min_length: minimum ROI length handed to RoiParams (default 1)
    :return: a numpy array built from the chemicals produced by
        RoiToChemicalCreator for the retained ROIs
    """
    # Extraction itself applies no intensity cut (min_intensity=0); the
    # intensity threshold is enforced on the extracted ROIs below.
    extraction_params = RoiParams(mz_tol=mz_tol,
                                  min_length=min_length,
                                  min_intensity=0,
                                  start_rt=start_rt,
                                  stop_rt=stop_rt)
    candidates = make_roi(mzML_file, extraction_params)

    # keep ROI that have at least one point above the minimum to fragment threshold
    retained = [
        roi for roi in candidates
        if np.any(np.array(roi.intensity_list) > min_ms1_intensity)
    ]

    # No peak sampler is supplied here (old_unused_experimental).
    creator = RoiToChemicalCreator(None, retained)
    return np.array(creator.chemicals)
def get_rois(mzml, min_roi_length, mzml2chems_dict=QCB_MZML2CHEMS_DICT):
    """
    Extract ROIs from an mzML file using settings taken from a dictionary.

    :param mzml: input mzML file, passed unchanged to make_roi
    :param min_roi_length: value used for the min_length ROI parameter
    :param mzml2chems_dict: dictionary supplying the 'mz_tol', 'mz_units',
        'min_intensity', 'start_rt' and 'stop_rt' settings
    :return: whatever make_roi returns for this file and these parameters
    """
    # Every setting except min_length is read from the dictionary.
    dict_keys = ('mz_tol', 'mz_units', 'min_intensity', 'start_rt', 'stop_rt')
    dict_settings = {key: mzml2chems_dict[key] for key in dict_keys}
    params = RoiParams(min_length=min_roi_length, **dict_settings)
    return make_roi(mzml, params)
def mzml2chems(mzml_file, ps, param_dict=QCB_MZML2CHEMS_DICT, output_dir = True, n_peaks=1):
    """
    Extract ROIs from an mzML file and convert them into chemicals.

    :param mzml_file: input mzML file
    :param ps: object passed as the first argument to RoiToChemicalCreator
    :param param_dict: dictionary supplying 'mz_tol', 'mz_units',
        'min_length', 'min_intensity', 'start_rt', 'stop_rt' and
        'min_ms1_intensity'
    :param output_dir: when exactly True, the chemicals are saved with
        save_obj next to the input file, using the '.p' extension
    :param n_peaks: passed as the third argument to RoiToChemicalCreator
    :return: the chemicals created by RoiToChemicalCreator
    """
    roi_keys = ('mz_tol', 'mz_units', 'min_length', 'min_intensity',
                'start_rt', 'stop_rt')
    roi_settings = {key: param_dict[key] for key in roi_keys}

    # NOTE(review): assumes make_roi accepts these keyword settings and
    # returns a (good, junk) pair here - confirm against the make_roi in use.
    good_roi, junk = make_roi(mzml_file, **roi_settings)

    # Keep only ROIs with at least one point above the MS1 threshold.
    selected = [
        roi for roi in good_roi + junk
        if np.any(np.array(roi.intensity_list) > param_dict['min_ms1_intensity'])
    ]

    dataset = RoiToChemicalCreator(ps, selected, n_peaks).chemicals
    if output_dir is True:
        pickle_path = os.path.splitext(mzml_file)[0] + '.p'
        save_obj(dataset, pickle_path)
    return dataset
def _get_distributions(self):
    """
    Build the RT and log-intensity distributions from the mzML file.

    MS1 scans whose RT lies within [self.min_rt, self.max_rt] are grouped by
    ``int(rt)`` and weighted by their summed peak intensity; the result is
    stored in ``self.rt_bins`` (as ``(k, k + 1)`` pairs) and ``self.rt_probs``.

    The log of each ROI's maximum intensity, restricted to
    [self.min_log_intensity, self.max_log_intensity], is histogrammed into
    ``self.n_intensity_bins`` bins; the result is stored in
    ``self.intensity_bins`` and ``self.intensity_probs``.
    """
    mzml_file_object = MZMLFile(str(self.mzml_file_name))
    rt_bins = {}
    for scan in mzml_file_object.scans:
        if scan.ms_level != 1:
            continue
        rt = scan.rt_in_seconds
        if rt < self.min_rt or rt > self.max_rt:
            continue
        # Sum the intensities directly instead of unpacking zip(*scan.peaks):
        # the unpacking form raises ValueError for a scan with no peaks,
        # whereas an empty scan simply contributes zero here.
        scan_intensity = sum(intensity for _, intensity in scan.peaks)
        rt_bin = int(rt)
        rt_bins[rt_bin] = rt_bins.get(rt_bin, 0) + scan_intensity

    total_intensity = sum(rt_bins.values())
    self.rt_bins = [(k, k + 1) for k in rt_bins]
    self.rt_probs = [v / total_intensity for v in rt_bins.values()]

    good = make_roi(str(self.mzml_file_name), self.roi_params)
    log_roi_intensities = [
        log_intensity
        for log_intensity in (np.log(max(r.intensity_list)) for r in good)
        if self.min_log_intensity <= log_intensity <= self.max_log_intensity
    ]

    hist, bin_edges = np.histogram(log_roi_intensities,
                                   bins=self.n_intensity_bins)
    total_i = hist.sum()
    # Each bin is the pair of consecutive edges returned by np.histogram.
    self.intensity_bins = list(zip(bin_edges[:-1], bin_edges[1:]))
    self.intensity_probs = [h / total_i for h in hist]
def topn_processor():
    """
    Run one Top-N simulation on the example beer file and compare it with the
    real data.

    Steps, in order: train densities from the real mzML file, extract ROIs
    and turn those with a point above the fragmentation threshold into
    chemicals, run the Top-N controller in a simulated environment, write the
    simulated mzML file, then call the comparison and plotting helpers.

    :return: list of paths - the simulated mzML file followed by the three
        output paths handed to the plotting helpers
    """
    base_dir = 'documents/simple_ms1/example_data'
    # base_dir = 'example_data'
    results_dir = os.path.join(base_dir, 'results')
    mzml_path = os.path.join(base_dir, 'beers', 'fragmentation', 'mzML')
    file_name = 'Beer_multibeers_1_T10_POS.mzML'
    mzml_file = os.path.join(mzml_path, file_name)

    experiment_out_dir = os.path.join(results_dir, 'mzml_compare')
    mzml_filename = 'simulated.mzML'
    mzml_out = os.path.join(experiment_out_dir, mzml_filename)
    pathlist = [mzml_out]

    min_rt = 0
    max_rt = 1441

    # Top-N controller settings
    isolation_width = 1  # the (full) isolation width in Dalton around a selected precursor m/z
    ionisation_mode = POSITIVE
    top_n = 10
    controller_rt_tol = 15
    controller_mz_tol = 10
    min_ms1_intensity = 1.75E5  # minimum ms1 intensity to fragment

    print('#' * 10, 'Train densities')
    kde_min_ms1_intensity = 0  # min intensity to be selected for kdes
    kde_min_ms2_intensity = 0
    bandwidth_mz_intensity_rt = 1.0
    bandwidth_n_peaks = 1.0
    ds = DataSource()
    ds.load_data(mzml_path, file_name=file_name)
    ps = get_spectral_feature_database(ds, file_name, kde_min_ms1_intensity,
                                       kde_min_ms2_intensity, min_rt, max_rt,
                                       bandwidth_mz_intensity_rt,
                                       bandwidth_n_peaks)

    print('#' * 10, 'Extract all ROIs')
    good_roi, junk = make_roi(mzml_file,
                              mz_tol=10,
                              mz_units='ppm',
                              min_length=1,
                              min_intensity=0,
                              start_rt=min_rt,
                              stop_rt=max_rt)
    all_roi = good_roi + junk
    print('#' * 10, len(all_roi))

    # Only ROIs with at least one point above the threshold become chemicals.
    all_roi = [
        roi for roi in all_roi
        if np.any(np.array(roi.intensity_list) > min_ms1_intensity)
    ]

    set_log_level_debug()
    data = RoiToChemicalCreator(ps, all_roi).chemicals
    save_obj(data, os.path.join(experiment_out_dir, 'dataset.p'))

    set_log_level_warning()
    mass_spec = IndependentMassSpectrometer(ionisation_mode, data, ps)
    controller = TopNController(ionisation_mode, top_n, isolation_width,
                                controller_mz_tol, controller_rt_tol,
                                min_ms1_intensity)

    # create an environment to run both the mass spec and controller
    env = Environment(mass_spec, controller, min_rt, max_rt, progress_bar=True)

    # set the log level to WARNING so we don't see too many messages when
    # environment is running
    set_log_level_warning()

    # run the simulation
    env.run()
    set_log_level_debug()
    env.write_mzML(experiment_out_dir, mzml_filename)

    print('#' * 10, 'Compare Results')
    matplotlib.use('agg')
    (simulated_mzs, simulated_rts, simulated_intensities,
     simulated_cumsum_ms1, simulated_cumsum_ms2) = count_stuff(
        mzml_out, min_rt, max_rt)
    (real_mzs, real_rts, real_intensities,
     real_cumsum_ms1, real_cumsum_ms2) = count_stuff(
        mzml_file, min_rt, max_rt)

    plt.rcParams.update({'font.size': 14})
    out_file = os.path.join(results_dir, 'topN_num_scans.png')
    pathlist.append(out_file)
    plot_num_scans(real_cumsum_ms1, real_cumsum_ms2, simulated_cumsum_ms1,
                   simulated_cumsum_ms2, out_file)

    # Match real against simulated peaks at increasing RT tolerances; the
    # matches from the last (widest) tolerance feed the plots below.
    matching_mz_tol = None  # in ppm. if None, then 2 decimal places is used for matching the m/z
    for matching_rt_tol in (5, 10, 15):  # seconds
        matches = match_peaklist(real_mzs, real_rts, real_intensities,
                                 simulated_mzs, simulated_rts,
                                 simulated_intensities, matching_mz_tol,
                                 matching_rt_tol)
        check_found_matches(matches, 'Real', 'Simulated')

    matched_intensities = []
    unmatched_intensities = []
    for key, value in matches.items():
        bucket = unmatched_intensities if value is None else matched_intensities
        bucket.append(key[2])

    plt.rcParams.update({'font.size': 18})
    out_file = os.path.join(results_dir, 'topN_matched_intensities.png')
    plot_matched_intensities(matched_intensities, unmatched_intensities,
                             out_file)
    pathlist.append(out_file)

    out_file = os.path.join(results_dir, 'topN_matched_precursors.png')
    plot_matched_precursors(matches, 50, 1000, 180, 1260, out_file)
    pathlist.append(out_file)
    return pathlist
def _save_topn_metric_plot(result_df, metric, title, ylabel, fig_out):
    """Plot one metric column of result_df against N and save it to fig_out."""
    plt.figure(figsize=(12, 6))
    ax = sns.lineplot(x='N', y=metric, hue='experiment', legend='brief',
                      data=result_df)
    plt.title(title)
    for line in ax.lines:
        plt.setp(line, linewidth=5)
    plt.ylabel(ylabel)
    plt.xlabel(r'Top-$N$')
    plt.legend(prop={'size': 20})
    plt.tight_layout()
    plt.savefig(fig_out, dpi=300)


def varying_topn_processor():
    """
    Run Top-N simulations for a range of N values and evaluate each one.

    Steps, in order: train densities from the example beer mzML file, extract
    ROIs and turn those with a point above the fragmentation threshold into
    chemicals, run one experiment per (N, rt_tol) combination, compute
    performance for every controller that could be loaded, and save
    precision, recall and F1 line plots in the experiment output directory.

    :return: list with one fragmentation-file path per (N, rt_tol)
        combination, built from the relative output directory
    """
    pathlist = []
    base_dir = 'documents/simple_ms1/example_data'
    mzml_path = os.path.join(base_dir, 'beers', 'fragmentation', 'mzML')
    file_name = 'Beer_multibeers_1_T10_POS.mzML'

    experiment_name = 'beer1pos'
    # The relative form is used for the returned paths, the absolute form
    # for everything read from or written to disk.
    url_experiment_out_dir = os.path.join(base_dir, 'results',
                                          experiment_name, 'mzML')
    experiment_out_dir = os.path.abspath(
        os.path.join(base_dir, 'results', experiment_name, 'mzML'))

    min_rt = 3 * 60  # start time when compounds begin to elute in the mzML file
    max_rt = 21 * 60

    kde_min_ms1_intensity = 0  # min intensity to be selected for kdes
    kde_min_ms2_intensity = 0

    roi_mz_tol = 10
    roi_min_length = 1
    roi_min_intensity = 0
    roi_start_rt = min_rt
    roi_stop_rt = max_rt

    isolation_window = 1  # the isolation window in Dalton around a selected precursor ion
    ionisation_mode = POSITIVE
    mz_tol = 10
    min_ms1_intensity = 1.75E5  # minimum ms1 intensity to fragment

    print('#' * 10, 'Train densities')
    ds = DataSource()
    ds.load_data(mzml_path, file_name=file_name)
    bandwidth_mz_intensity_rt = 1.0
    bandwidth_n_peaks = 1.0
    ps = get_spectral_feature_database(ds, file_name, kde_min_ms1_intensity,
                                       kde_min_ms2_intensity, min_rt, max_rt,
                                       bandwidth_mz_intensity_rt,
                                       bandwidth_n_peaks)

    print('#' * 10, 'Extract all ROIs')
    mzml_file = os.path.join(mzml_path, file_name)
    good_roi, junk = make_roi(mzml_file,
                              mz_tol=roi_mz_tol,
                              mz_units='ppm',
                              min_length=roi_min_length,
                              min_intensity=roi_min_intensity,
                              start_rt=roi_start_rt,
                              stop_rt=roi_stop_rt)
    all_roi = good_roi + junk
    # NOTE(review): only ROIs with n == 1 are counted, despite the label.
    print('#' * 10, 'How many singleton and non-singleton ROIs =>',
          len([roi for roi in all_roi if roi.n == 1]))

    # Only ROIs with at least one point above the threshold become chemicals.
    all_roi = [
        roi for roi in all_roi
        if np.any(np.array(roi.intensity_list) > min_ms1_intensity)
    ]

    set_log_level_debug()
    rtcc = RoiToChemicalCreator(ps, all_roi)
    data = rtcc.chemicals
    save_obj(data, os.path.join(experiment_out_dir, 'dataset.p'))

    print('#' * 10, 'Run Top-N Controller')
    set_log_level_warning()
    pbar = False  # turn off progress bar
    Ns = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60,
        65, 70, 75, 80, 85, 90, 95, 100
    ]
    rt_tols = [15]
    params = get_params(experiment_name, Ns, rt_tols, mz_tol,
                        isolation_window, ionisation_mode, data, ps,
                        min_ms1_intensity, min_rt, max_rt, experiment_out_dir,
                        pbar)
    run_serial_experiment(params)

    print('#' * 10, 'Analyse Results')
    min_ms1_intensity = 0  # no intensity cut is applied during evaluation
    rt_range = [(min_rt, max_rt)]
    mz_range = [(0, math.inf)]
    results_dir = os.path.join(base_dir, 'results', 'ground_truth', 'mzML')
    csv_file = os.path.join(results_dir, 'extracted_peaks_ms1.csv')
    P_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)
    csv_file = os.path.join(experiment_out_dir, 'extracted_peaks_ms1.csv')
    Q_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)

    fullscan_filename = 'Beer_multibeers_1_fullscan1.mzML'
    matching_mz_tol = 10  # ppm
    matching_rt_tol = 30  # seconds

    results = []
    for N in Ns:
        for rt_tol in rt_tols:
            # load chemicals and check for matching; reloaded on every
            # iteration so each evaluation starts from a fresh copy
            chemicals = load_obj(os.path.join(experiment_out_dir, 'dataset.p'))
            fragfile_filename = 'experiment_%s_N_%d_rttol_%d.mzML' % (
                experiment_name, N, rt_tol)

            # load controller and compute performance
            controller = load_controller(experiment_out_dir, experiment_name,
                                         N, rt_tol)
            pathlist.append(
                os.path.join(url_experiment_out_dir, fragfile_filename))
            if controller is None:
                continue

            tp, fp, fn, prec, rec, f1 = compute_performance_scenario_2(
                controller, chemicals, min_ms1_intensity, fullscan_filename,
                fragfile_filename, P_peaks_df, Q_peaks_df, matching_mz_tol,
                matching_rt_tol)
            print(
                '%s N=%d rt_tol=%d tp=%d fp=%d fn=%d prec=%.3f rec=%.3f f1=%.3f'
                % (experiment_name, N, rt_tol, tp, fp, fn, prec, rec, f1))
            results.append(
                (experiment_name, N, rt_tol, tp, fp, fn, prec, rec, f1))

    result_df = pd.DataFrame(results,
                             columns=[
                                 'experiment', 'N', 'rt_tol', 'TP', 'FP', 'FN',
                                 'Prec', 'Rec', 'F1'
                             ])

    # All three figures go through the same helper so that each one is
    # actually written; the recall figure was previously drawn but not saved.
    _save_topn_metric_plot(result_df, 'Prec', 'Top-N Precision', 'Precision',
                           os.path.join(experiment_out_dir,
                                        'topN_precision.png'))
    _save_topn_metric_plot(result_df, 'Rec', 'Top-N Recall', 'Recall',
                           os.path.join(experiment_out_dir, 'topN_recall.png'))
    _save_topn_metric_plot(result_df, 'F1', 'Top-N F1', r'$F_{1}\;score$',
                           os.path.join(experiment_out_dir, 'topN_f1.png'))
    return pathlist
def _extract_rois(self):
    """
    Extract ROIs from this object's mzML file.

    Calls make_roi with the file name (converted to str) and
    ``self.roi_params``, logs how many ROIs came back at debug level, and
    returns them unchanged.
    """
    rois = make_roi(str(self.mzml_file_name), self.roi_params)
    message = "Extracted {} good ROIs from {}".format(len(rois),
                                                      self.mzml_file_name)
    logger.debug(message)
    return rois