# In[12]:

expr_codes = ["a0806_077", "a0806_078", "a0806_079"]
# expr_codes = ["a0806_140", "a0806_141", "a0806_142"]

# Loop over the experiments and perform the processing.

# In[13]:

for expr_code in expr_codes:

    print(f" -> Processing experiment '{expr_code}'")

    andi_file = data_directory / f"{expr_code}.cdf"

    data = ANDI_reader(andi_file)

    im = build_intensity_matrix_i(data)

    n_scan, n_mz = im.size

    # Preprocess the data (Savitzky-Golay smoothing and Tophat baseline correction)

    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic)
        ic_smooth = savitzky_golay(ic1)  # second smoothing pass for additional noise reduction
        ic_bc = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_bc)

    # Peak detection
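    # A minimal sketch of a typical PyMassSpec peak-detection step; the
    # BillerBiemann parameters and thresholds below are illustrative assumptions,
    # not the original notebook's settings (the import would normally sit with
    # the other imports at the top of the file).
    from pyms.BillerBiemann import BillerBiemann, num_ions_threshold, rel_threshold

    peak_list = BillerBiemann(im, points=9, scans=2)
    peak_list = rel_threshold(peak_list, percent=2)
    peak_list = num_ions_threshold(peak_list, n=3, cutoff=3000)
    print(f"    {len(peak_list)} peaks detected in '{expr_code}'")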
Example 2
def andi(datadir):
    print("data")
    return ANDI_reader(datadir / "gc01_0812_066.cdf")
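# A minimal usage sketch, assuming andi() above is registered as a pytest
# fixture (e.g. decorated with @pytest.fixture) and that 'datadir' is itself a
# fixture; the test below is illustrative and not taken from the source.
def test_andi_reader(andi):
    # GCMS_data exposes parallel lists of retention times and Scan objects
    assert len(andi.time_list) == len(andi.scan_list)
    # Retention times increase monotonically across the run
    assert andi.time_list[0] < andi.time_list[-1]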
Example 3
import pathlib

from pyms.Display import Display
from pyms.GCMS.IO.ANDI import ANDI_reader
from pyms.IntensityMatrix import build_intensity_matrix_i
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Peak.Function import peak_sum_area
from pyms.Simulator import gcms_sim
from pyms.TopHat import tophat

data_directory = pathlib.Path(".").resolve().parent.parent / "pyms-data"
# Change this if the data files are stored in a different location

output_directory = pathlib.Path(".").resolve() / "output"

# read raw data
andi_file = data_directory / "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Trim the data to the scan range of interest
data.trim(4101, 4350)

# Build Intensity Matrix
real_im = build_intensity_matrix_i(data)

n_scan, n_mz = real_im.size

# Perform the necessary pre-filtering (smoothing and baseline correction)
for ii in range(n_mz):
    ic = real_im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    real_im.set_ic_at_index(ii, ic_bc)
Example 4
import pathlib

data_directory = pathlib.Path(".").resolve().parent.parent / "pyms-data"
# Change this if the data files are stored in a different location

output_directory = pathlib.Path(".").resolve() / "output"

from pyms.GCMS.IO.ANDI import ANDI_reader

# Read the raw ANDI-MS data

# In[2]:

andi_file = data_directory / "gc01_0812_066.cdf"
data = ANDI_reader(andi_file)
print(data)

# ### A GCMS_data Object
#
# The object ``data`` (from the two previous examples) stores the raw data as a
# ``pyms.GCMS.Class.GCMS_data`` object.
# Within the ``GCMS_data`` object, raw data are stored as a list of
# ``pyms.Spectrum.Scan`` objects and a list of retention times. There are
# several methods available to access data and attributes of the ``GCMS_data``
# and ``Scan`` objects.
#
# The ``GCMS_data`` object's methods relate to the raw data.
# The main properties relate to the masses, retention times and scans. For example, the
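# A short sketch (not from the original notebook) of the kind of attribute
# access described above, using GCMS_data and Scan properties:

print(data.min_rt, data.max_rt)      # retention-time range, in seconds
print(len(data.time_list))           # number of scans
scan = data.scan_list[0]             # the first Scan object
print(scan.min_mass, scan.max_mass)  # mass range of that scan
print(len(scan.mass_list))           # number of recorded m/z values in the scan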
Example 5
"""proc.py
"""

# This file has been replaced by jupyter/reading_andi.ipynb

import pathlib
data_directory = pathlib.Path(".").resolve().parent.parent / "pyms-data"
# Change this if the data files are stored in a different location

from pyms.GCMS.IO.ANDI import ANDI_reader

# read the raw data
andi_file = data_directory / "gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# print info
data.info()

# Write the data to the output directory. This creates two ASCII data tables,
# data.I.csv and data.mz.csv, containing the intensities and m/z values.
data.write("output/data")
Example 6
"""proc.py
"""
# This file has been replaced by jupyter/NoiseSmoothing.ipynb

from pyms.GCMS.IO.ANDI import ANDI_reader
from pyms.Noise.SavitzkyGolay import savitzky_golay

# read the raw data
andi_file = "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# get the TIC
tic = data.get_tic()

tic1 = savitzky_golay(tic)

tic.write("output/tic.dat", minutes=True)
tic1.write("output/tic1.dat", minutes=True)
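
# An optional sketch (not part of the original proc.py): compare the raw and
# smoothed TICs with matplotlib rather than inspecting the written .dat files.
import matplotlib.pyplot as plt

plt.plot(tic.time_list, tic.intensity_array, label="raw TIC")
plt.plot(tic1.time_list, tic1.intensity_array, label="Savitzky-Golay smoothed TIC")
plt.xlabel("Retention time (s)")
plt.ylabel("Intensity")
plt.legend()
plt.show()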
Example 7
# Imports used inside this function; presumably these (and the MZML / NETCDF
# file-type constants) live at module level in pyms.Gapfill.Function.
from pyms.BillerBiemann import get_maxima_list_reduced
from pyms.IntensityMatrix import build_intensity_matrix_i
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Peak.Function import ion_area
from pyms.TopHat import tophat


def missing_peak_finder(sample,
                        file_name,
                        points=3,
                        null_ions=None,
                        crop_ions=None,
                        threshold=1000,
                        rt_window=1,
                        filetype=MZML):
    """
	Integrates raw data around missing peak locations to fill NAs in the data matrix

	:param sample: The sample object containing missing peaks
	:type sample: :class:`pyms.Gapfill.Class.Sample`

	:param file_name: Name of the raw data file
	:type file_name: str
	:param points: Peak finding - Peak if maxima over 'points' number of scans. Default ``3``
	:type points: int, optional
	:param  null_ions: Ions to be deleted in the matrix. Default ``[73, 147]``
	:type null_ions: list, optional
	:param crop_ions: Range of Ions to be considered. Default ``[50, 540]``
	:type crop_ions: list, optional
	:param threshold: Minimum intensity of IonChromatogram allowable to fill. Default ``1000``
	:type threshold: int, optional
	:param  rt_window: Window in seconds around average RT to look for. Default ``1``
	:type rt_window: float, optional
	:param filetype: either `MZML` (default) or `NETCDF`
	:type filetype: int, optional

	:author: Sean O'Callaghan
	"""

    if not null_ions:
        null_ions = [73, 147]
    if not crop_ions:
        crop_ions = [50, 540]

    # TODO: some error checks on null and crop ions

    # TODO: a for root,files,dirs in os.path.walk(): loop
    print("Sample:", sample.get_name(), "File:", file_name)

    if filetype.lower() == 'cdf':
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(file_name)
    elif filetype.lower() == 'mzml':
        from pyms.GCMS.IO.MZML import mzML_reader
        data = mzML_reader(file_name)
    else:
        raise ValueError(f"Invalid filetype {filetype!r}; expected MZML or NETCDF")

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    n_scan, n_mz = im.size

    # smooth data
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.get_missing_peaks():

        mp_rt = mp.rt
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci = ", common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 = ", qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 = ", qual_ion_2)
        # Integrate the CI around that particular RT

        # Convert time to points
        # How long between scans?

        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        print("rt_window = ", points_1 - points_2)

        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        large_peaks = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peaks.append([rt, intens])

        print(f'found {len(large_peaks):d} peaks above threshold')

        areas = []
        for peak in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak[0])
            ia = ci_ion_chrom.get_intensity_array().tolist()
            area, left, right, l_share, r_share = ion_area(ia, apex, 0)
            areas.append(area)

        ########################

        areas.sort()
        if len(areas) > 0:
            biggest_area = areas[-1]
            mp.set_ci_area(biggest_area)
            mp.set_exact_rt(f"{float(mp_rt) / 60.0:.3f}")
            print("found area:", biggest_area, "at rt:", mp_rt)
        else:
            print("Missing peak at rt = ", mp_rt)
            mp.set_ci_area('na')