Ejemplo n.º 1
0
def get_chemicals(mzML_file,
                  mz_tol,
                  min_ms1_intensity,
                  start_rt,
                  stop_rt,
                  min_length=1):
    """
    Extract ROIs from an mzML file and convert them into chemical objects.

    ROI extraction itself is run with ``min_intensity=0``; the intensity
    filter is applied afterwards, keeping only the ROIs that have at least
    one point strictly above ``min_ms1_intensity``.

    :param mzML_file: input mzML file
    :param mz_tol: m/z tolerance passed to the ROI extraction
    :param min_ms1_intensity: an ROI is kept only if at least one of its
        intensities is strictly greater than this threshold
    :param start_rt: start RT passed to the ROI extraction
    :param stop_rt: stop RT passed to the ROI extraction
    :param min_length: minimum ROI length passed to the ROI extraction
    :return: a numpy array built from the chemicals produced by
        RoiToChemicalCreator for the retained ROIs
    """
    extraction_params = RoiParams(mz_tol=mz_tol,
                                  min_length=min_length,
                                  min_intensity=0,
                                  start_rt=start_rt,
                                  stop_rt=stop_rt)
    extracted = make_roi(mzML_file, extraction_params)

    # retain only the ROIs with at least one point above the threshold
    fragmentable = [
        roi for roi in extracted
        if np.any(np.array(roi.intensity_list) > min_ms1_intensity)
    ]

    # the first argument of RoiToChemicalCreator is deliberately None here
    # (marked as old/unused/experimental in the original code)
    creator = RoiToChemicalCreator(None, fragmentable)
    return np.array(creator.chemicals)
Ejemplo n.º 2
0
def get_rois(mzml, min_roi_length, mzml2chems_dict=QCB_MZML2CHEMS_DICT):
    """
    Extract ROIs from an mzML file using settings taken from a parameter dict.

    :param mzml: the mzML file handed to make_roi
    :param min_roi_length: minimum ROI length (used as ``min_length``)
    :param mzml2chems_dict: dict providing the 'mz_tol', 'mz_units',
        'min_intensity', 'start_rt' and 'stop_rt' entries
    :return: whatever make_roi returns for these parameters
    """
    settings = mzml2chems_dict
    params = RoiParams(
        mz_tol=settings['mz_tol'],
        mz_units=settings['mz_units'],
        min_length=min_roi_length,
        min_intensity=settings['min_intensity'],
        start_rt=settings['start_rt'],
        stop_rt=settings['stop_rt'],
    )
    return make_roi(mzml, params)
Ejemplo n.º 3
0
Archivo: BOMAS.py Proyecto: 5ggg/vimms
def mzml2chems(mzml_file, ps, param_dict=QCB_MZML2CHEMS_DICT, output_dir=True, n_peaks=1):
    """
    Convert the ROIs of an mzML file into a chemical dataset.

    Both the "good" and the "junk" ROIs returned by make_roi are considered;
    an ROI is kept when at least one of its intensities is strictly above
    ``param_dict['min_ms1_intensity']``.

    :param mzml_file: path of the input mzML file
    :param ps: passed through as the first argument of RoiToChemicalCreator
    :param param_dict: dict providing 'mz_tol', 'mz_units', 'min_length',
        'min_intensity', 'start_rt', 'stop_rt' and 'min_ms1_intensity'
    :param output_dir: when exactly ``True``, the dataset is saved next to the
        input file, with the extension replaced by '.p'
    :param n_peaks: passed through to RoiToChemicalCreator
    :return: the chemicals produced by RoiToChemicalCreator
    """
    good_roi, junk = make_roi(
        mzml_file,
        mz_tol=param_dict['mz_tol'],
        mz_units=param_dict['mz_units'],
        min_length=param_dict['min_length'],
        min_intensity=param_dict['min_intensity'],
        start_rt=param_dict['start_rt'],
        stop_rt=param_dict['stop_rt'],
    )

    # the threshold is looked up per ROI, exactly as in the original loop
    selected = [
        roi for roi in good_roi + junk
        if np.any(np.array(roi.intensity_list) > param_dict['min_ms1_intensity'])
    ]

    dataset = RoiToChemicalCreator(ps, selected, n_peaks).chemicals
    if output_dir is True:
        stem, _ = os.path.splitext(mzml_file)
        save_obj(dataset, stem + '.p')
    return dataset
Ejemplo n.º 4
0
    def _get_distributions(self):
        """
        Build the RT and ROI-intensity histograms from ``self.mzml_file_name``.

        Sets four attributes:

        * ``self.rt_bins``: list of ``(k, k + 1)`` tuples, one per integer RT
          value ``k`` at which an MS1 scan inside ``[self.min_rt,
          self.max_rt]`` was seen.
        * ``self.rt_probs``: for each RT bin, its summed MS1 intensity divided
          by the summed intensity of all bins.
        * ``self.intensity_bins``: list of ``(left_edge, right_edge)`` tuples
          from a histogram of the log of each ROI's maximum intensity,
          restricted to ``[self.min_log_intensity, self.max_log_intensity]``.
        * ``self.intensity_probs``: the normalised counts of that histogram.
        """
        mzml_file_object = MZMLFile(str(self.mzml_file_name))
        rt_bins = {}
        for scan in mzml_file_object.scans:
            if scan.ms_level != 1:
                continue
            rt = scan.rt_in_seconds
            # Check the RT window first so out-of-range scans cost nothing.
            if rt < self.min_rt or rt > self.max_rt:
                continue
            # Sum pairwise rather than via ``zip(*scan.peaks)``: unpacking the
            # result of zip() raises ValueError when a scan has no peaks,
            # whereas here an empty scan simply contributes 0.
            total_intensity = sum(intensity for _, intensity in scan.peaks)
            rt_bin = int(rt)
            rt_bins[rt_bin] = rt_bins.get(rt_bin, 0) + total_intensity

        total_intensity = sum(rt_bins.values())
        self.rt_bins = [(k, k + 1) for k in rt_bins]
        self.rt_probs = [v / total_intensity for v in rt_bins.values()]

        good = make_roi(str(self.mzml_file_name), self.roi_params)
        # log of the apex intensity of each ROI, limited to the allowed range
        log_roi_intensities = [
            log_i
            for log_i in (np.log(max(r.intensity_list)) for r in good)
            if self.min_log_intensity <= log_i <= self.max_log_intensity
        ]
        hist, bin_edges = np.histogram(log_roi_intensities,
                                       bins=self.n_intensity_bins)
        total_i = hist.sum()

        # consecutive edge pairs: (edge[0], edge[1]), (edge[1], edge[2]), ...
        self.intensity_bins = list(zip(bin_edges[:-1], bin_edges[1:]))
        self.intensity_probs = [h / total_i for h in hist]
0
def topn_processor():
    """
    Run a single Top-N simulation on the example beer data and compare the
    simulated mzML against the real one.

    Steps, in order: train the spectral feature database, extract ROIs, keep
    those with a point above the fragmentation threshold, build and save the
    chemicals, run the Top-N controller in the simulated environment, write
    the simulated mzML, then produce the comparison plots.

    :return: list of output paths: the simulated mzML followed by the
        scan-count, matched-intensities and matched-precursors plots
    """
    # ---- locations ----
    base_dir = 'documents/simple_ms1/example_data'
    results_dir = os.path.join(base_dir, 'results')
    mzml_path = os.path.join(base_dir, 'beers', 'fragmentation', 'mzML')
    file_name = 'Beer_multibeers_1_T10_POS.mzML'
    experiment_out_dir = os.path.join(results_dir, 'mzml_compare')
    mzml_filename = 'simulated.mzML'
    mzml_out = os.path.join(experiment_out_dir, mzml_filename)

    # ---- settings ----
    min_rt = 0
    max_rt = 1441
    kde_min_ms1_intensity = 0  # min intensity to be selected for kdes
    kde_min_ms2_intensity = 0
    isolation_width = 1  # the (full) isolation width in Dalton around a selected precursor m/z
    ionisation_mode = POSITIVE
    N = 10
    controller_rt_tol = 15
    controller_mz_tol = 10
    min_ms1_intensity = 1.75E5  # minimum ms1 intensity to fragment

    banner = '#' * 10

    print(banner, 'Train densities')
    ds = DataSource()
    ds.load_data(mzml_path, file_name=file_name)
    ps = get_spectral_feature_database(ds, file_name, kde_min_ms1_intensity,
                                       kde_min_ms2_intensity, min_rt, max_rt,
                                       1.0,   # bandwidth_mz_intensity_rt
                                       1.0)   # bandwidth_n_peaks

    print(banner, 'Extract all ROIs')
    mzml_file = os.path.join(mzml_path, file_name)
    good_roi, junk = make_roi(mzml_file,
                              mz_tol=10,
                              mz_units='ppm',
                              min_length=1,
                              min_intensity=0,
                              start_rt=min_rt,
                              stop_rt=max_rt)
    extracted = good_roi + junk
    print(banner, len(extracted))

    # keep ROIs having at least one point above the fragmentation threshold
    fragmentable = [
        roi for roi in extracted
        if np.any(np.array(roi.intensity_list) > min_ms1_intensity)
    ]

    set_log_level_debug()
    data = RoiToChemicalCreator(ps, fragmentable).chemicals
    save_obj(data, os.path.join(experiment_out_dir, 'dataset.p'))

    set_log_level_warning()
    mass_spec = IndependentMassSpectrometer(ionisation_mode, data, ps)
    controller = TopNController(ionisation_mode, N, isolation_width,
                                controller_mz_tol, controller_rt_tol,
                                min_ms1_intensity)
    # the environment runs both the mass spec and the controller
    env = Environment(mass_spec, controller, min_rt, max_rt, progress_bar=True)

    # log level WARNING keeps the output quiet while the environment runs
    set_log_level_warning()
    env.run()
    set_log_level_debug()
    env.write_mzML(experiment_out_dir, mzml_filename)

    print(banner, 'Compare Results')
    matplotlib.use('agg')
    (simulated_mzs, simulated_rts, simulated_intensities,
     simulated_cumsum_ms1, simulated_cumsum_ms2) = count_stuff(
        mzml_out, min_rt, max_rt)
    (real_mzs, real_rts, real_intensities,
     real_cumsum_ms1, real_cumsum_ms2) = count_stuff(
        mzml_file, min_rt, max_rt)

    plt.rcParams.update({'font.size': 14})
    num_scans_png = os.path.join(results_dir, 'topN_num_scans.png')
    plot_num_scans(real_cumsum_ms1, real_cumsum_ms2, simulated_cumsum_ms1,
                   simulated_cumsum_ms2, num_scans_png)

    # Match real against simulated peaks at increasing RT tolerances
    # (seconds). The m/z tolerance (ppm) is None, in which case 2 decimal
    # places are used for matching the m/z. The matches of the last pass
    # (15 s) feed the plots below.
    matching_mz_tol = None
    matches = None
    for matching_rt_tol in (5, 10, 15):
        matches = match_peaklist(real_mzs, real_rts, real_intensities,
                                 simulated_mzs, simulated_rts,
                                 simulated_intensities, matching_mz_tol,
                                 matching_rt_tol)
        check_found_matches(matches, 'Real', 'Simulated')

    # split the intensities (third element of each key) by match outcome
    match_items = list(matches.items())
    unmatched_intensities = [k[2] for k, v in match_items if v is None]
    matched_intensities = [k[2] for k, v in match_items if v is not None]
    plt.rcParams.update({'font.size': 18})

    matched_intensities_png = os.path.join(results_dir,
                                           'topN_matched_intensities.png')
    plot_matched_intensities(matched_intensities, unmatched_intensities,
                             matched_intensities_png)

    matched_precursors_png = os.path.join(results_dir,
                                          'topN_matched_precursors.png')
    plot_matched_precursors(matches, 50, 1000, 180, 1260,
                            matched_precursors_png)

    return [mzml_out, num_scans_png, matched_intensities_png,
            matched_precursors_png]
Ejemplo n.º 6
0
def _save_topn_metric_plot(result_df, metric, title, ylabel, out_file):
    """
    Draw one performance metric against N as a line plot and save it.

    :param result_df: dataframe with at least the 'N', 'experiment' and
        ``metric`` columns
    :param metric: name of the column plotted on the y axis
    :param title: figure title
    :param ylabel: y-axis label
    :param out_file: path of the image to write (300 dpi)
    """
    plt.figure(figsize=(12, 6))
    ax = sns.lineplot(x='N',
                      y=metric,
                      hue='experiment',
                      legend='brief',
                      data=result_df)
    plt.title(title)
    for line in ax.lines:
        plt.setp(line, linewidth=5)
    plt.ylabel(ylabel)
    plt.xlabel(r'Top-$N$')
    plt.legend(prop={'size': 20})
    plt.tight_layout()
    plt.savefig(out_file, dpi=300)


def varying_topn_processor():
    """
    Run Top-N simulations for a range of N values on the example beer data,
    evaluate each run and plot precision, recall and F1 against N.

    The three figures are written to the experiment output directory as
    'topN_precision.png', 'topN_recall.png' and 'topN_f1.png'.

    :return: list of the (relative) paths of the fragmentation mzML files,
        one per (N, rt_tol) combination
    """
    pathlist = []
    base_dir = 'documents/simple_ms1/example_data'
    mzml_path = os.path.join(base_dir, 'beers', 'fragmentation', 'mzML')
    file_name = 'Beer_multibeers_1_T10_POS.mzML'

    experiment_name = 'beer1pos'
    # relative form is used for the returned paths, absolute form for I/O
    url_experiment_out_dir = os.path.join(base_dir, 'results', experiment_name,
                                          'mzML')
    experiment_out_dir = os.path.abspath(
        os.path.join(base_dir, 'results', experiment_name, 'mzML'))
    min_rt = 3 * 60  # start time when compounds begin to elute in the mzML file
    max_rt = 21 * 60
    kde_min_ms1_intensity = 0  # min intensity to be selected for kdes
    kde_min_ms2_intensity = 0

    roi_mz_tol = 10
    roi_min_length = 1
    roi_min_intensity = 0
    roi_start_rt = min_rt
    roi_stop_rt = max_rt

    isolation_window = 1  # the isolation window in Dalton around a selected precursor ion
    ionisation_mode = POSITIVE
    mz_tol = 10
    min_ms1_intensity = 1.75E5  # minimum ms1 intensity to fragment

    print('#' * 10, 'Train densities')
    ds = DataSource()
    ds.load_data(mzml_path, file_name=file_name)
    bandwidth_mz_intensity_rt = 1.0
    bandwidth_n_peaks = 1.0
    ps = get_spectral_feature_database(ds, file_name, kde_min_ms1_intensity,
                                       kde_min_ms2_intensity, min_rt, max_rt,
                                       bandwidth_mz_intensity_rt,
                                       bandwidth_n_peaks)

    print('#' * 10, 'Extract all ROIs')
    mzml_file = os.path.join(mzml_path, file_name)
    good_roi, junk = make_roi(mzml_file,
                              mz_tol=roi_mz_tol,
                              mz_units='ppm',
                              min_length=roi_min_length,
                              min_intensity=roi_min_intensity,
                              start_rt=roi_start_rt,
                              stop_rt=roi_stop_rt)
    all_roi = good_roi + junk
    # NOTE(review): despite the message, the number printed is the count of
    # ROIs with n == 1 only; message kept unchanged.
    print('#' * 10, 'How many singleton and non-singleton ROIs =>',
          len([roi for roi in all_roi if roi.n == 1]))

    # keep ROIs having at least one point above the fragmentation threshold
    all_roi = [
        roi for roi in all_roi
        if np.count_nonzero(
            np.array(roi.intensity_list) > min_ms1_intensity) > 0
    ]

    set_log_level_debug()
    rtcc = RoiToChemicalCreator(ps, all_roi)
    data = rtcc.chemicals
    save_obj(data, os.path.join(experiment_out_dir, 'dataset.p'))

    print('#' * 10, 'Run Top-N Controller')
    set_log_level_warning()
    pbar = False  # turn off progress bar
    Ns = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60,
        65, 70, 75, 80, 85, 90, 95, 100
    ]
    rt_tols = [15]
    params = get_params(experiment_name, Ns, rt_tols, mz_tol, isolation_window,
                        ionisation_mode, data, ps, min_ms1_intensity, min_rt,
                        max_rt, experiment_out_dir, pbar)
    run_serial_experiment(params)

    print('#' * 10, 'Analyse Results')
    # the analysis uses no intensity cut-off, unlike the fragmentation above
    min_ms1_intensity = 0
    rt_range = [(min_rt, max_rt)]
    mz_range = [(0, math.inf)]
    results_dir = os.path.join(base_dir, 'results', 'ground_truth', 'mzML')
    csv_file = os.path.join(results_dir, 'extracted_peaks_ms1.csv')
    P_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)

    csv_file = os.path.join(experiment_out_dir, 'extracted_peaks_ms1.csv')
    Q_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)

    fullscan_filename = 'Beer_multibeers_1_fullscan1.mzML'
    matching_mz_tol = 10  # ppm
    matching_rt_tol = 30  # seconds

    results = []
    for N in Ns:
        for rt_tol in rt_tols:
            # the chemicals are reloaded from disk for every run, as in the
            # original code
            chemicals = load_obj(os.path.join(experiment_out_dir, 'dataset.p'))
            fragfile_filename = 'experiment_%s_N_%d_rttol_%d.mzML' % (
                experiment_name, N, rt_tol)

            # load controller and compute performance
            controller = load_controller(experiment_out_dir, experiment_name,
                                         N, rt_tol)
            pathlist.append(
                os.path.join(url_experiment_out_dir, fragfile_filename))

            # runs without a loadable controller are left out of the results
            if controller is None:
                continue

            tp, fp, fn, prec, rec, f1 = compute_performance_scenario_2(
                controller, chemicals, min_ms1_intensity,
                fullscan_filename, fragfile_filename, P_peaks_df,
                Q_peaks_df, matching_mz_tol, matching_rt_tol)
            print(
                '%s N=%d rt_tol=%d tp=%d fp=%d fn=%d prec=%.3f rec=%.3f f1=%.3f'
                % (experiment_name, N, rt_tol, tp, fp, fn, prec, rec, f1))
            results.append(
                (experiment_name, N, rt_tol, tp, fp, fn, prec, rec, f1))

    result_df = pd.DataFrame(results,
                             columns=[
                                 'experiment', 'N', 'rt_tol', 'TP', 'FP', 'FN',
                                 'Prec', 'Rec', 'F1'
                             ])

    # One figure per metric. The recall figure used to be drawn but never
    # written to disk; going through the shared helper saves all three.
    _save_topn_metric_plot(
        result_df, 'Prec', 'Top-N Precision', 'Precision',
        os.path.join(experiment_out_dir, 'topN_precision.png'))
    _save_topn_metric_plot(
        result_df, 'Rec', 'Top-N Recall', 'Recall',
        os.path.join(experiment_out_dir, 'topN_recall.png'))
    _save_topn_metric_plot(
        result_df, 'F1', 'Top-N F1', r'$F_{1}\;score$',
        os.path.join(experiment_out_dir, 'topN_f1.png'))

    return pathlist
Ejemplo n.º 7
0
 def _extract_rois(self):
     """
     Run ROI extraction on ``self.mzml_file_name`` with ``self.roi_params``.

     The number of extracted ROIs is reported at debug level.

     :return: the ROIs returned by make_roi
     """
     source_path = str(self.mzml_file_name)
     rois = make_roi(source_path, self.roi_params)
     message = "Extracted {} good ROIs from {}".format(
         len(rois), self.mzml_file_name)
     logger.debug(message)
     return rois