Esempi in Python per MzML, esempi in Python per pyteomics.mzml.MzML

Esempio n. 1

0

Mostra file

def read_scans(mzml_file, ms_levels=(1, 2), should_renumber_ifmissing=True):
    """
    Generate parsed scans from an mzML file, restricted to selected MS levels.

    Every spectrum in the file is counted (1-based), whether or not its level
    is selected. Spectra whose 'ms level' is in ms_levels are handed to
    read_scan and the result is yielded.

    :param mzml_file: mzML source passed straight to mzml.MzML
    :param ms_levels: collection of MS levels to keep (defaults to 1 and 2)
    :param should_renumber_ifmissing: if true, the 1-based position of the
    spectrum in the file is passed to read_scan as default_scan_number; if
    false, None is passed instead (per the original notes, read_scan then
    fails when it cannot parse an integer scan number -- confirm in read_scan)
    :return: generator over the values returned by read_scan
    """
    with mzml.MzML(mzml_file) as reader:
        for position, scan in enumerate(reader, start=1):
            if scan['ms level'] not in ms_levels:
                continue
            fallback_number = position if should_renumber_ifmissing else None
            # A ValueError skips just this scan. Per the original author's
            # note it signals that the charge could not be inferred, which is
            # acceptable as long as enough other scans remain usable.
            try:
                yield read_scan(scan, default_scan_number=fallback_number)
            except ValueError as e:
                logger.debug("Warning! Failed to read scan: %s" % e)

Esempio n. 2

0

Mostra file

def readmzXML(file_paths):
    """
    Load the raw spectra of several mzML files into a single dictionary.

    Despite its name, this function opens every path with mzml.MzML.

    :param file_paths: iterable of file path strings (each path is
        concatenated into the progress message, so it must be a str)
    :return: dict mapping each spectrum's 'spectrum title' to a dict with
        the keys 'm/z' and 'intensity'; a title seen more than once (within
        or across files) keeps only the last spectrum read
    """
    data_dict = {}
    for file_path in file_paths:
        # Use the reader as a context manager so its file handle is released
        # before the next file is opened, instead of leaking one handle per
        # input file.
        with mzml.MzML(file_path) as data:
            file_num = len(data)

            for spectrum in data:
                data_dict[spectrum['spectrum title']] = {
                    'm/z': spectrum['m/z array'],
                    'intensity': spectrum['intensity array'],
                }

        print('successfully loaded '+str(file_num)+' raw spectra from'+file_path)

    return data_dict

Esempio n. 3

0

Mostra file

def get_spectra(source: Union[IO, str], scan_nrs: Sequence[int] = None)\
        -> Iterator[MsmsSpectrum]:
    """
    Yield the MS/MS spectra of an mzML file, optionally restricted to a set
    of scan numbers.

    Parameters
    ----------
    source : Union[IO, str]
        The mzML source (file name or open file object) to read from.
    scan_nrs : Sequence[int]
        Scan numbers to fetch, in the order given. Each one is looked up by
        the id 'controllerType=0 controllerNumber=1 scan=<nr>'. If `None`,
        every spectrum whose 'ms level' equals 2 is used instead.

    Returns
    -------
    Iterator[MsmsSpectrum]
        An iterator over the results of `_parse_spectrum`. Spectra for which
        `_parse_spectrum` raises ValueError are skipped silently; an
        LxmlError ends the iteration with a logged warning.
    """
    with mzml.MzML(source) as f_in:

        def raw_spectra():
            # Lazily produce the unparsed spectrum dicts for either mode.
            if scan_nrs is not None:
                # Targeted lookup by native id, one scan number at a time.
                for scan_nr in scan_nrs:
                    yield f_in.get_by_id(
                        f'controllerType=0 controllerNumber=1 scan={scan_nr}')
                return
            # No filter given: walk the whole file and keep MS level 2 only.
            for candidate in f_in:
                if int(candidate.get('ms level', -1)) == 2:
                    yield candidate

        try:
            for raw in raw_spectra():
                try:
                    yield _parse_spectrum(raw)
                except ValueError:
                    # Unparseable spectrum: skipped without logging.
                    pass
        except LxmlError as e:
            logger.warning('Failed to read file %s: %s', source, e)

Esempio n. 4

0

Mostra file

def extract_from_mzml(path):
    """
    Extract MS1 and MS2 data from '<path>file.mzML' into '<path>mzML.json'.

    Nothing is done when '<path>mzML.json' already exists. `path` is used as
    a plain string prefix, so it must already end with a path separator if
    it names a directory.

    The output is written through gzip.GzipFile, so despite its '.json' name
    the file is gzip-compressed JSON. Scan ids are integer keys in memory;
    json.dumps serialises them as strings.

    :param path: string prefix shared by the input and output file names
    :return: None
    """
    # Extract the data from the mzML only if we haven't already.
    if os.path.exists(f'{path}mzML.json'):
        return

    if not multiprocessing:
        print(
            'Extracting data from mzML                                                    ',
            end='\r')

    extracted = {'ms1': {}, 'ms2': {}}

    # Context manager so the mzML file handle is released as soon as the
    # spectra have been consumed (the reader used to be left open).
    with mzml.MzML(f'{path}file.mzML') as data:
        for spectrum in data:
            level = spectrum['ms level']
            if level not in (1, 2):
                continue

            # The scan id is whatever follows 'scan=' in the native id.
            scan_id = int(spectrum['id'].split('scan=')[1])

            if level == 1:
                ms1_spectrum = process_ms1(spectrum)
                extracted['ms1'][scan_id] = {
                    'mz': ms1_spectrum['mz'],
                    'intensity': ms1_spectrum['intensity'],
                    'scan_time': ms1_spectrum['scan_time']
                }
            else:
                ms2_spectrum = process_ms2(spectrum)
                extracted['ms2'][scan_id] = {
                    'scan_index': ms2_spectrum['scan_index'],
                    'precursor_scan': ms2_spectrum['precursor_scan'],
                    'precursor_ion': ms2_spectrum['precursor_ion'],
                    'm/z_array': list(ms2_spectrum['m/z']),
                    'rt_array': list(ms2_spectrum['rt'])
                }

    # The with-statement closes the file; no explicit close() is needed.
    with gzip.GzipFile(f'{path}mzML.json', 'w') as fout:
        fout.write(json.dumps(extracted).encode('utf-8'))

Esempio n. 5

0

Mostra file

File: io.py Progetto: LSARP/lrg-omics

def mzml_to_pandas_df(filename):
    """
    Reads mzML file and returns a pandas.DataFrame.

    Each spectrum contributes one row per peak with the columns
    "retentionTime", "m/z array" and "intensity array". "retentionTime" is
    the first scan's 'scan time' divided by 60 (presumably a seconds to
    minutes conversion -- confirm the unit reported by the reader).

    The file is iterated with a plain for-loop inside a context manager.
    The previous ``while True`` / bare ``except`` loop treated *any* error as
    end-of-file, so a problem with a single spectrum silently truncated the
    result; such errors now propagate to the caller. Frames are built from
    the three required columns only, so unrelated spectrum metadata cannot
    interfere with DataFrame construction.

    The frame is passed to df_to_numeric before being returned.
    """
    cols = ["retentionTime", "m/z array", "intensity array"]
    slices = []
    with mzml.MzML(filename) as reader:
        for data in reader:
            retention_time = data["scanList"]["scan"][0]["scan time"] / 60
            slices.append(
                pd.DataFrame(
                    {
                        "retentionTime": retention_time,
                        "m/z array": data["m/z array"],
                        "intensity array": data["intensity array"],
                    }
                )
            )
    df = pd.concat(slices)[cols]
    df_to_numeric(df)
    return df

Esempio n. 6

0

Mostra file

File: io.py Progetto: marioernestovaldes/lrg-omics

def mzml_to_pandas_df(filename):
    '''
    Reads mzML file and returns a pandas.DataFrame.

    Each spectrum contributes one row per peak with the columns
    'retentionTime', 'm/z array' and 'intensity array'. 'retentionTime' is
    the first scan's 'scan time' divided by 60 (presumably a seconds to
    minutes conversion -- confirm the unit reported by the reader).

    The reader is used as a context manager and iterated directly. Errors
    raised while handling a spectrum now propagate; the earlier bare
    ``except`` swallowed them and silently returned a truncated table.
    Only the three required columns are placed in each per-spectrum frame.

    The frame is passed to df_to_numeric before being returned.
    '''
    cols = ['retentionTime', 'm/z array', 'intensity array']
    slices = []
    with mzml.MzML(filename) as reader:
        for data in reader:
            retention_time = data['scanList']['scan'][0]['scan time'] / 60
            slices.append(
                pd.DataFrame({
                    'retentionTime': retention_time,
                    'm/z array': data['m/z array'],
                    'intensity array': data['intensity array'],
                }))
    df = pd.concat(slices)[cols]
    df_to_numeric(df)
    return df

Esempio n. 7

0

Mostra file

def mzml_to_pandas_df_pyteomics(fn):
    '''
    Reads mzML file and returns a pandas.DataFrame.

    Each spectrum contributes one row per peak with the columns
    'retentionTime', 'm/z array' and 'intensity array'. 'retentionTime' is
    the first scan's 'scan time' divided by 60 (presumably a seconds to
    minutes conversion -- confirm the unit reported by the reader).

    After df_to_numeric has been applied, 'intensity array' is cast to int
    and the index is reset to a fresh 0..N-1 range.

    Errors raised while handling a spectrum propagate to the caller; the
    earlier bare ``except`` treated every error as end-of-file and silently
    returned a truncated table. Only the three required columns are placed
    in each per-spectrum frame.
    '''
    cols = ['retentionTime', 'm/z array', 'intensity array']
    slices = []
    with mzml.MzML(fn) as ms_data:
        for data in ms_data:
            retention_time = data['scanList']['scan'][0]['scan time'] / 60
            slices.append(
                pd.DataFrame({
                    'retentionTime': retention_time,
                    'm/z array': data['m/z array'],
                    'intensity array': data['intensity array'],
                }))
    df = pd.concat(slices)[cols]
    df_to_numeric(df)
    df['intensity array'] = df['intensity array'].astype(int)
    df = df.reset_index(drop=True)
    return df

Esempio n. 8

0

Mostra file

File: extractor.py Progetto: veitveit/MassSpecPipeline

def internalmzML(path):
    """
    Extract MS1 data from '<path>file.mzML' into '<path>mzML.json', then
    delete the mzML file.

    Nothing is done (and nothing is deleted) when '<path>mzML.json' already
    exists. `path` is used as a plain string prefix, so it must already end
    with a path separator if it names a directory.

    The output is written through gzip.GzipFile, so despite its '.json' name
    the file is gzip-compressed JSON. Scan ids are integer keys in memory;
    json.dumps serialises them as strings. Only MS level 1 spectra are
    extracted; all other levels are ignored.

    :param path: string prefix shared by the input and output file names
    :return: None
    """
    # Extract the data from the mzML only if we haven't already.
    if os.path.exists(f'{path}mzML.json'):
        return

    if not multithread:
        print(
            'Extracting data from mzML                                                    ',
            end='\r')

    extracted = {'ms1': {}}

    # The reader must be closed before the source file is removed below:
    # deleting a file that is still open fails on some platforms.
    with mzml.MzML(f'{path}file.mzML') as data:
        for spectrum in data:
            if spectrum['ms level'] != 1:
                continue

            # The scan id is whatever follows 'scan=' in the native id.
            scan_id = int(spectrum['id'].split('scan=')[1])

            ms1_spectrum = process_ms1(spectrum)
            extracted['ms1'][scan_id] = {
                'mz': ms1_spectrum['mz'],
                'intensity': ms1_spectrum['intensity'],
                'scan_time': ms1_spectrum['scan_time']
            }

    # The with-statement closes the file; no explicit close() is needed.
    with gzip.GzipFile(f'{path}mzML.json', 'w') as fout:
        fout.write(json.dumps(extracted).encode('utf-8'))
    os.remove(f'{path}file.mzML')

Esempio n. 9

0

Mostra file

def read(mzml_file: str, max_peaks: int = None, min_intensity: float = None)\
         -> Tuple[np.ndarray, DIAScan]:
    """
    Read an mzML file from a DIA experiment.

    Every spectrum is converted with `_mkscan` through the reader's `map`
    method using 4 worker processes, wrapped in the `_pbar` progress helper.

    Parameters
    ----------
    mzml_file : str
        The mzML file to read.
    max_peaks : int
        Forwarded unchanged to `_mkscan` as the keyword `max_peaks`.
    min_intensity : float
        Forwarded unchanged to `_mkscan` as the keyword `min_intensity`.

    Returns
    -------
    diadem.dataset.DIARun
        A DIARun object built from the list of converted scans.
        NOTE(review): the return annotation says Tuple[np.ndarray, DIAScan],
        which does not match what is returned -- confirm and align.
    """
    scan_options = dict(max_peaks=max_peaks, min_intensity=min_intensity)
    with mzml.MzML(mzml_file) as mz_dat:
        converted = mz_dat.map(_mkscan, kwargs=scan_options, processes=4)
        scans = DIARun(list(_pbar(converted)))

    return scans

Esempio n. 10

0

Mostra file

def qc1_main():
    """
    Command-line entry point of the iRT peptide QC tool.

    Reads an mzML file and a tab-separated targets table (columns
    "Sequence", "Precursor_Mz" and "Product_Mz" are used), extracts MS1 and
    MS2 quality metrics per target and writes:

    * ``<mzml stem>_Figs.pdf``      -- one page of MS1 plots, one of MS2 plots
    * ``<mzml stem>_MS1_table.csv`` -- tab-separated MS1 metrics
    * ``<mzml stem>_MS2_table.csv`` -- tab-separated MS2 metrics
    * ``MS1.pdf`` in the current directory (see the note near the end)

    With ``--debug`` the parsed spectra are cached in ``<mzml>.pkl``.

    Result rows are collected in lists and turned into DataFrames once,
    because ``DataFrame.append`` no longer exists in pandas 2.x.
    """
    argparser = ArgumentParser(description="iRT peptide QC tool")
    argparser.add_argument('--mzml', type=str, required=True, help="MzML file")
    argparser.add_argument('--targets',
                           type=str,
                           required=True,
                           help="Targets file")
    argparser.add_argument('--ms1-ppm',
                           type=float,
                           default=5,
                           help="MS1 extraction window in ppm")
    argparser.add_argument('--ms2-prec-tolerance',
                           type=float,
                           default=0.01,
                           help="MS2 precursor tolerance")
    argparser.add_argument('--ms2-frag-tolerance',
                           type=float,
                           default=1,
                           help="MS2 fragment tolerance")
    argparser.add_argument('--width-1-pc',
                           type=float,
                           default=50,
                           help="Chromatographic width 1 in %% of apex")
    argparser.add_argument('--width-2-pc',
                           type=float,
                           default=5,
                           help="Chromatographic width 2 in %% of apex")
    argparser.add_argument('--debug',
                           action="store_true",
                           help="Pickle cache input file")
    args = argparser.parse_args()

    # Output files share the mzML name with its last extension removed.
    b_fname = ".".join(args.mzml.split(".")[:-1])
    pdf = PdfPages(b_fname + "_Figs.pdf")

    if args.debug:
        import pickle
        import time
        import os

        if os.path.exists(args.mzml + ".pkl"):
            # NOTE(review): pickle.load can execute arbitrary code stored in
            # the cache file; only use --debug with caches you created.
            with open(args.mzml + ".pkl", "rb") as f_:
                _start_time = time.time()
                print("Unpickling")
                exp = lcmsms.LCMSMSExperiment(
                    tqdm.tqdm(pickle.load(f_)),
                    prec_tolerance=args.ms2_prec_tolerance)
                print(f"Unpickled in {time.time()-_start_time} seconds")
        else:
            print("Reading and pickling")
            mzml_ = list(tqdm.tqdm(mzml.MzML(args.mzml)))
            with open(args.mzml + ".pkl", "wb") as f_:
                pickle.dump(mzml_, f_)
            print("Pickled, parsing experiment")
            exp = lcmsms.LCMSMSExperiment(
                tqdm.tqdm(mzml_), prec_tolerance=args.ms2_prec_tolerance)
            del mzml_
    else:
        exp = lcmsms.LCMSMSExperiment(
            tqdm.tqdm(mzml.MzML(args.mzml)),
            prec_tolerance=args.ms2_prec_tolerance)

    ###  MS1 processing  ####
    targets = pd.read_csv(args.targets, sep='\t')
    targets_ms1 = targets[["Sequence", "Precursor_Mz"]].drop_duplicates()
    ms1_columns = [
        "Sequence",
        "Precursor_Mz",
        "Apex_time",
        f"Width_{args.width_1_pc}_pc_time_start",
        f"Width_{args.width_1_pc}_pc_time_end",
        f"Width_{args.width_1_pc}_xic_area",
        f"Width_{args.width_2_pc}_pc_time_start",
        f"Width_{args.width_2_pc}_pc_time_end",
        f"Width_{args.width_2_pc}_xic_area",
        "MS1_mass_apex_mz",
        "MS1_apex_height",
        "MS1_peak_halfwidth",
        "MS1_peak_area",
        "TIC_MS2",
    ]
    ms1_rows = []

    # One row of four panels per MS1 target.
    fig, axs = plt.subplots(len(targets_ms1),
                            4,
                            figsize=(15, 60),
                            gridspec_kw={'width_ratios': [1, 1, 1, 1]})
    plt.subplots_adjust(hspace=0.5)

    for n, (_, row) in enumerate(targets_ms1.iterrows()):
        mz = row["Precursor_Mz"]
        seq = row["Sequence"]
        ch = exp.ms1.xic(mz, args.ms1_ppm)
        chs = ch.smooth(sigma=2)
        # The apex comes from the raw chromatogram, widths and areas from
        # the smoothed one.
        apext, apexi = ch.get_apex()
        width1 = chs.get_width_pc(args.width_1_pc)
        width2 = chs.get_width_pc(args.width_2_pc)
        area1 = chs.get_width_pc_area(args.width_1_pc)
        area2 = chs.get_width_pc_area(args.width_2_pc)
        spec = exp.ms1[apext]
        ms1_apex_mz, ms1_apex_int = spec.get_apex_around(mz, 0.05)
        ms1_hw = spec.get_apex_width_pc(mz, apex_pc=50, tolerance=0.05)
        ms1_area = spec.get_peak_area(mz, tolerance=0.05)

        ### PLOTS ###
        # XIC
        axs[n, 0].ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        axs[n, 0].ticklabel_format(axis="x", style='plain')
        axs[n, 0].plot(ch.t, ch.i, "g-")
        axs[n, 0].vlines(apext, 0, apexi * 1.1)
        axs[n, 0].title.set_text(f"{seq}\nmz={mz:.4f}\napex@{apext:.2f}min")
        axs[n, 0].set_xlim(15, 30)

        # XIC zoom
        # NOTE(review): the guide lines and the "Area50"/"Area5" labels below
        # are hard-coded for 50 % and 5 %, whereas the widths and areas
        # themselves follow --width-1-pc / --width-2-pc.
        axs[n, 1].ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        axs[n, 1].ticklabel_format(axis="x", style='plain')
        axs[n, 1].plot(ch.t, ch.i, "gx-")
        axs[n, 1].plot(chs.t, chs.i, "rx-")
        axs[n, 1].vlines(apext, 0, apexi)
        axs[n, 1].title.set_text(f"MS1 XIC zoon\n mz={mz:.4f}")
        axs[n, 1].hlines(apexi * 0.5, *width1)
        axs[n, 1].hlines(apexi * 0.05, *width2)
        axs[n, 1].set_xlim(apext - 0.2, apext + 0.4)
        axs[n, 1].text(0.45,
                       0.95,
                       f"Area50={area1:.3e}\nArea5  ={area2:.3e}",
                       transform=axs[n, 1].transAxes,
                       fontsize=10,
                       verticalalignment='top')

        # MS1 spectrum
        axs[n, 2].ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        axs[n, 2].ticklabel_format(axis="x", style='plain')
        axs[n, 2].title.set_text(f"MS1 spectrum\n@time={apext:.2f}min")
        spec.plot(ax=axs[n, 2], marks=[ms1_apex_mz])

        # MS1 spectrum zoom
        spec_zoom = spec[ms1_apex_mz -
                         args.ms2_prec_tolerance:ms1_apex_mz +
                         args.ms2_prec_tolerance]  # No /2 (sic!)
        axs[n, 3].ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        axs[n, 3].ticklabel_format(axis="x",
                                   style='sci',
                                   scilimits=(-3, -3),
                                   useOffset=ms1_apex_mz)
        spec_zoom.plot("go-", ax=axs[n, 3])
        axs[n, 3].title.set_text("MS1 zoom\n mz={:.4f}".format(mz))
        axs[n, 3].vlines(ms1_apex_mz, 0, ms1_apex_int, "r")
        ms1_w_left, ms1_w_right = spec.get_apex_times_pc(ms1_apex_mz,
                                                         apex_pc=50,
                                                         tolerance=0.05)
        axs[n, 3].hlines(ms1_apex_int / 2, ms1_w_left, ms1_w_right, "r")
        axs[n, 3].text(0.55,
                       0.95,
                       f"Area={ms1_area:.2e}\nHW={ms1_hw:.2e}",
                       transform=axs[n, 3].transAxes,
                       fontsize=10,
                       verticalalignment='top')
        ############

        row['Apex_time'] = apext
        row[f"Width_{args.width_1_pc}_pc_time_start"] = width1[0]
        row[f"Width_{args.width_1_pc}_pc_time_end"] = width1[1]
        row[f"Width_{args.width_1_pc}_xic_area"] = area1
        row[f"Width_{args.width_2_pc}_pc_time_start"] = width2[0]
        row[f"Width_{args.width_2_pc}_pc_time_end"] = width2[1]
        row[f"Width_{args.width_2_pc}_xic_area"] = area2
        row["MS1_mass_apex_mz"] = ms1_apex_mz
        row["MS1_apex_height"] = ms1_apex_int
        row["MS1_peak_halfwidth"] = ms1_hw
        row["MS1_peak_area"] = ms1_area

        ms1_rows.append(row)

    # Build the table in one go; each Series keeps its original row label.
    results_ms1 = pd.DataFrame(ms1_rows, columns=ms1_columns)

    pdf.savefig(fig)
    plt.close(fig)

    ###  MS2 processing  ###
    results_ms1.set_index("Sequence", drop=True, inplace=True)
    targets_ms2 = targets[["Sequence", "Precursor_Mz",
                           "Product_Mz"]].drop_duplicates()
    ms2_columns = [
        "Sequence",
        "Precursor_Mz",
        "Product_Mz",
        "MS2_TIC_Apex_time",
        "MS2_mass_apex_mz",
        "MS2_apex_height",
        "MS2_peak_halfwidth",
        "MS2_peak_area",
    ]
    ms2_rows = []
    ms2_groups = targets_ms2.groupby(by=["Sequence", "Precursor_Mz"])
    # Widest precursor decides how many fragment panels every row needs.
    n_ = max(len(grp) for _, grp in ms2_groups)
    fig, axs = plt.subplots(len(targets_ms1), 3 + n_, figsize=(15, 100))
    plt.subplots_adjust(hspace=0.5)

    for n, ((seq, prec), grp) in enumerate(ms2_groups):
        # NOTE(review): these lookups assume each Sequence occurs once in
        # the MS1 table; a repeated Sequence makes .loc return a Series.
        apext = results_ms1.loc[seq, "Apex_time"]
        start = results_ms1.loc[seq,
                                f"Width_{args.width_2_pc}_pc_time_start"]
        stop = results_ms1.loc[seq,
                               f"Width_{args.width_2_pc}_pc_time_end"]

        ms2_all = exp.ms2.extract(prec)
        ms2_ext = ms2_all[start - args.ms2_frag_tolerance / 2:stop +
                          args.ms2_frag_tolerance / 2]

        tic_apext, tic_apexint = ms2_ext.tic.get_apex()
        results_ms1.loc[seq, "TIC_MS2"] = tic_apexint
        spec = ms2_ext[tic_apext]

        ### PLOTS ###
        # TIC MS2
        axs[n, 0].ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        axs[n, 0].ticklabel_format(axis="x", style='plain')
        axs[n, 0].plot(ms2_all.tic.t, ms2_all.tic.i, "g-")
        axs[n, 0].title.set_text("TIC MS2\nmz={:.4f}\n apex@{:.2f}\n".format(
            prec, tic_apext))
        # TIC MS2 zoom
        axs[n, 1].ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        axs[n, 1].ticklabel_format(axis="x", style='plain')
        axs[n, 1].plot(ms2_ext.tic.t, ms2_ext.tic.i, "g-")
        axs[n, 1].vlines(tic_apext, 0, tic_apexint, "r")
        axs[n, 1].title.set_text(
            f"TIC MS2 zoom\nmz={prec:.4f}\n apex@{tic_apext:.2f}\n")
        # MS2 spectrum
        axs[n, 2].ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        axs[n, 2].ticklabel_format(axis="x", style='plain')
        spec.plot(ax=axs[n, 2])
        axs[n, 2].title.set_text(
            f"MS/MS for\n{prec:.4f}\n@time={tic_apext:.2f}min\n")

        # Fragment panels start in column 3; a panel is reserved for every
        # fragment even when no peak is found for it.
        for nn, (_, row) in enumerate(grp.iterrows(), start=3):
            frag = row["Product_Mz"]
            try:
                fmz, fint = spec.get_apex_around(frag,
                                                 args.ms2_frag_tolerance)
                f_hw = spec.get_apex_width_pc(
                    frag, apex_pc=50, tolerance=args.ms2_frag_tolerance)
                f_area = spec.get_peak_area(
                    fmz, tolerance=args.ms2_frag_tolerance)
                s_ext = spec[fmz - args.ms2_frag_tolerance / 2:fmz +
                             args.ms2_frag_tolerance / 2]
                s_ext.plot("go-", ax=axs[n, nn])
                axs[n, nn].ticklabel_format(axis="y",
                                            style='sci',
                                            scilimits=(0, 0))
                axs[n, nn].ticklabel_format(axis="x", style='plain')
                axs[n, nn].title.set_text(f"MS2 zoom\nmz={fmz:.4f}\n")
                axs[n, nn].vlines(fmz, 0, max(s_ext.i), "r")
                axs[n, nn].vlines(frag, 0, max(s_ext.i), "blue")
                axs[n, 2].plot([frag], [fint], "rx")
                ms2_w_left, ms2_w_right = s_ext.get_apex_times_pc(
                    fmz, apex_pc=50, tolerance=0.05)
                axs[n, nn].hlines(fint / 2, ms2_w_left, ms2_w_right, "r")
                # Annotate with this fragment's own area and half-width; the
                # stale MS1 values from the previous section were printed
                # here before.
                axs[n, nn].text(0.6,
                                0.98,
                                f"Area=\n{f_area:.2e}\n\nHW=\n{f_hw:.2e}",
                                transform=axs[n, nn].transAxes,
                                fontsize=8,
                                verticalalignment='top')

            except lcmsms.PeaksNotFound:
                # No usable peak: record zeros for every fragment metric.
                print(f"No MS2 peak for {prec:.4f}/{frag:.4f}")
                f_hw = 0
                f_area = 0
                fmz = 0
                fint = 0

            row["MS2_TIC_Apex_time"] = tic_apext
            row["MS2_mass_apex_mz"] = fmz
            row["MS2_apex_height"] = fint
            row["MS2_peak_halfwidth"] = f_hw
            row["MS2_peak_area"] = f_area
            ms2_rows.append(row)

    results_ms2 = pd.DataFrame(ms2_rows, columns=ms2_columns)

    pdf.savefig(fig)
    plt.close(fig)

    results_ms2.set_index("Sequence", drop=True, inplace=True)
    # NOTE(review): at this point `fig` is the MS2 figure, yet it is saved
    # under the fixed name "MS1.pdf" in the current directory -- kept as is,
    # confirm whether this extra file is still wanted.
    fig.savefig("MS1.pdf", dpi=1200, format='pdf', bbox_inches='tight')
    ms1_fname = b_fname + "_MS1_table.csv"
    ms2_fname = b_fname + "_MS2_table.csv"
    results_ms1.to_csv(ms1_fname, sep='\t')
    results_ms2.to_csv(ms2_fname, sep='\t')

    pdf.close()

Esempio n. 11

0

Mostra file

# Table built from three parallel sequences defined earlier in the script:
# source file, spectrum id and `seq` (presumably the identified peptide
# sequence -- confirm against the mzid parsing code).
mzid_df = pd.DataFrame(
    dict(file=file_location, id=spectrum_ids, seq=seq))


def _parse_mzml_entry(entry):
    ID = str(entry['id'])
    mz = np.array(entry['m/z array'])
    intensities = np.array(entry['intensity array'])
    return ID, mz, intensities


# Collect (file, id, m/z array, intensity array) for every spectrum of every
# distinct mzML file referenced by the identifications.
all_spectra = []

for file in np.unique(file_location):
    print(file)
    # Open each reader as a context manager so its file handle is released
    # before the next file is processed.
    with mzml.MzML(file) as indexed:
        all_spectra.extend(
            (file, ) + entry for entry in indexed.map(_parse_mzml_entry))

# Transpose the list of 4-tuples into four parallel column tuples.
mzml_location, ids, mz, intensities = zip(*all_spectra)

spectra_df = pd.DataFrame({
    'file': mzml_location,
    'id': ids,
    'mz': mz,
    'intensities': intensities
})

#### MERGE: mzid + mzml

Esempio n. 12

0

Mostra file

File: convert_raw_spectra.py Progetto: jiahao95/project_lab-ss2020

import numpy as np
import pandas as pd


# %%
# define parsing function
def _parse_mzml_entry(entry):
    ID = str(entry['id'])
    mz = np.array(entry['m/z array'])
    intensities = np.array(entry['intensity array'])
    return ID, mz, intensities


all_spectra = []
data = '/home/ubuntu/data/jiahao/trp/output/Run1_U4_2000ng.mzML'
# Read every spectrum through the parsing function. The reader is opened as
# a context manager so its file handle is released once all entries have
# been collected.
with mzml.MzML(data) as file:
    all_spectra.extend(
        (data, ) + entry for entry in file.map(_parse_mzml_entry))

# %%
# generate pandas dataframe
# Transpose the list of 4-tuples into four parallel column tuples.
mzml_location, ids, mz, intensities = zip(*all_spectra)

spectra_df = pd.DataFrame({
    'file': mzml_location,
    'id': ids,
    'mz': mz,
    'intensities': intensities
})