def _get_peaks(spectrum):
    """
    Build the peak list of a single spectrum

    :param spectrum: a spectrum object exposing ``mz`` and ``i`` attributes and accepted by ``get_rt``
                     (assumes ``mz`` and ``i`` are equal-length 1-D sequences -- TODO confirm)
    :return: a numpy array with one row per peak and the columns (m/z, retention time, intensity)
    """
    mz_values = spectrum.mz
    # every peak of the spectrum gets the same, scan-level retention time
    rt_values = np.full(len(mz_values), get_rt(spectrum))
    return np.stack((mz_values, rt_values, spectrum.i), axis=1)
def get_precursor_info(fragfile):
    """
    Get (MS1) precursor peaks and their associated MS2 scans from an mzML file

    Scans are read in file order. The most recent MS1 scan is remembered, and every scan that
    reports a selected precursor is matched against it through ``_find_precursor_peaks``.
    A scan is skipped (the parse continues) when:

    * no MS1 scan has been seen yet, so there is nothing to match against (a warning is logged);
    * a ``KeyError`` is raised while processing it, e.g. the precursor entry has no ``'i'``
      value (skipped silently);
    * a ``ValueError`` is raised while processing it (the error is logged as a warning).

    Finally only the rows whose reported precursor intensity and matched MS1 intensity differ
    by less than 0.1 are kept.

    :param fragfile: path to an mzML file
    :return: a pandas dataframe that contains all the ms1 and ms2 information, plus an
             ``intensity_diff`` column; the frame is an independent copy and safe to modify
    :raises AssertionError: if a scan reports more than one selected precursor
    """
    run = pymzml.run.Reader(
        fragfile,
        obo_version='4.0.1',
        MS1_Precision=5e-6,
        extraAccessions=[('MS:1000016', ['value', 'unitName'])],
    )

    last_ms1_peaklist = None
    last_ms1_scan_no = 0
    isolation_width = 1.0  # Dalton
    data = []
    for scan_no, scan in enumerate(run):
        if scan.ms_level == 1:  # save the last ms1 scan that we've seen
            last_ms1_peaklist = _get_peaks(scan)
            last_ms1_scan_no = scan_no

        # TODO: it's better to use the "isolation window target m/z" field in the mzML file for matching
        precursors = scan.selected_precursors
        if len(precursors) == 0:
            continue

        # assume exactly 1 precursor peak for each ms2 scan. This is raised explicitly instead of
        # using `assert` so the check is not stripped under `python -O`; the exception type is
        # kept as AssertionError so existing callers observe the same failure.
        if len(precursors) != 1:
            raise AssertionError(
                'expected exactly 1 precursor for scan %d but found %d' % (scan_no, len(precursors)))
        precursor = precursors[0]

        # A precursor reported before the first MS1 scan cannot be matched to anything;
        # skip it rather than handing None to _find_precursor_peaks.
        if last_ms1_peaklist is None:
            logger.warning('Skipping scan %d: precursor found before any MS1 scan', scan_no)
            continue

        try:
            scan_rt = get_rt(scan)
            precursor_mz = precursor['mz']
            precursor_intensity = precursor['i']
            res = _find_precursor_peaks(precursor, last_ms1_peaklist, last_ms1_scan_no,
                                        isolation_width=isolation_width)
            ms2_peaklist = _get_peaks(scan)
            row = [scan_no, scan_rt, precursor_mz, precursor_intensity, ms2_peaklist]
            row.extend(res)
            data.append(row)
        except ValueError as e:
            logger.warning(e)
        except KeyError:
            continue  # sometimes we can't find the intensity value precursor['i'] in precursors

    columns = [
        'ms2_scan_id', 'ms2_scan_rt', 'ms2_precursor_mz', 'ms2_precursor_intensity', 'ms2_peaklist',
        'ms1_scan_id', 'ms1_scan_rt', 'ms1_mz', 'ms1_intensity',
    ]
    df = pd.DataFrame(data, columns=columns)

    # select only rows where we are sure of the matching, i.e. the intensity values aren't too different
    df['intensity_diff'] = np.abs(df['ms2_precursor_intensity'] - df['ms1_intensity'])
    idx = (df['intensity_diff'] < 0.1)
    # .copy() detaches the result from `df`, so callers can add or modify columns
    # without triggering pandas' SettingWithCopyWarning
    ms1_df = df[idx].copy()
    return ms1_df