# In[12]:

# Experiment codes to process; the commented-out list below is an alternative set.
expr_codes = ["a0806_077", "a0806_078", "a0806_079"]
# expr_codes = ["a0806_140", "a0806_141", "a0806_142"]

# Loop over the experiments and perform the processing.

# In[13]:

for expr_code in expr_codes:
    print(f" -> Processing experiment '{expr_code}'")

    # Read the raw data file for this experiment and build its intensity matrix
    andi_file = data_directory / f"{expr_code}.cdf"
    data = ANDI_reader(andi_file)
    im = build_intensity_matrix_i(data)
    n_scan, n_mz = im.size

    # Preprocess every ion chromatogram in place: two Savitzky-Golay
    # smoothing passes followed by the tophat filter.
    # (Open question carried over from the original: why the second pass?)
    for mz_index in range(n_mz):
        raw_ic = im.get_ic_at_index(mz_index)
        smoothed_ic = savitzky_golay(savitzky_golay(raw_ic))
        im.set_ic_at_index(mz_index, tophat(smoothed_ic, struct="1.5m"))

    # Peak detection
def andi(datadir):
    """
    Print ``data`` and read ``gc01_0812_066.cdf`` from ``datadir`` with ``ANDI_reader``.

    :param datadir: Directory containing the data file; must support the ``/`` operator.

    :return: Whatever ``ANDI_reader`` returns for that file.
    """
    print("data")
    cdf_path = datadir / "gc01_0812_066.cdf"
    return ANDI_reader(cdf_path)
from pyms.Display import Display
from pyms.GCMS.IO.ANDI import ANDI_reader
from pyms.IntensityMatrix import build_intensity_matrix_i
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Peak.Function import peak_sum_area
from pyms.Simulator import gcms_sim
from pyms.TopHat import tophat

# Resolve the working directory once; both locations below are relative to it.
working_directory = pathlib.Path(".").resolve()

# Change this if the data files are stored in a different location
data_directory = working_directory.parent.parent / "pyms-data"
output_directory = working_directory / "output"

# Read the raw data and keep only the section passed to trim()
andi_file = data_directory / "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)
data.trim(4101, 4350)

# Build the intensity matrix and record its dimensions
real_im = build_intensity_matrix_i(data)
n_scan, n_mz = real_im.size

# Pre-filter every ion chromatogram in place:
# Savitzky-Golay smoothing followed by the tophat filter.
for mz_index in range(n_mz):
    smoothed_ic = savitzky_golay(real_im.get_ic_at_index(mz_index))
    real_im.set_ic_at_index(mz_index, tophat(smoothed_ic, struct="1.5m"))
import pathlib

from pyms.GCMS.IO.ANDI import ANDI_reader

# Resolve the working directory once; both locations below are relative to it.
working_directory = pathlib.Path(".").resolve()

# Change this if the data files are stored in a different location
data_directory = working_directory.parent.parent / "pyms-data"
output_directory = working_directory / "output"

# Read the raw ANDI-MS data

# In[2]:

andi_file = data_directory / "gc01_0812_066.cdf"
data = ANDI_reader(andi_file)
print(data)

# ### A GCMS_data Object
#
# The object ``data`` (from the two previous examples) stores the raw data
# as a |pyms.GCMS.Class.GCMS_data| object.
# Within the |GCMS_data| object, raw data are stored as a list of
# |pyms.Spectrum.Scan| objects and a list of retention times.
# There are several methods available to access data and attributes of the
# |GCMS_data| and |Scan| objects.
#
# The |GCMS_data| object's methods relate to the raw data.
# The main properties relate to the masses, retention times and scans. For example, the
"""proc.py

Read a raw ANDI-MS data file, print its info and write it out as data tables.
"""
# This file has been replaced by jupyter/reading_andi.ipynb

import pathlib

from pyms.GCMS.IO.ANDI import ANDI_reader

# Change this if the data files are stored in a different location
working_directory = pathlib.Path(".").resolve()
data_directory = working_directory.parent.parent / "pyms-data"

# Read the raw data
andi_file = data_directory / "gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Print info
data.info()

# Write the data to the output file. This will create two ascii data
# tables, data.I.csv and data.mz.csv, with intensities and m/z values.
data.write("output/data")
"""proc.py

Read a raw data file, smooth its TIC and write both versions to disk.
"""
# This file has been replaced by jupyter/NoiseSmoothing.ipynb

from pyms.GCMS.IO.ANDI import ANDI_reader
from pyms.Noise.SavitzkyGolay import savitzky_golay

# Read the raw data
andi_file = "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Get the TIC and a Savitzky-Golay smoothed copy of it
tic = data.get_tic()
tic1 = savitzky_golay(tic)

# Write the original first, then the smoothed version
for chromatogram, destination in ((tic, "output/tic.dat"), (tic1, "output/tic1.dat")):
    chromatogram.write(destination, minutes=True)
def missing_peak_finder(sample, file_name, points=3, null_ions=None,
                        crop_ions=None, threshold=1000, rt_window=1, filetype=MZML):
    """
    Integrates raw data around missing peak locations to fill NAs in the data matrix

    For every missing peak of ``sample`` the common-ion chromatogram is searched
    for maxima around the peak's retention time. A maximum is accepted when it
    exceeds ``threshold`` and both qualifying ions exceed half of ``threshold``
    at the same time. The largest integrated area among the accepted maxima is
    stored on the missing peak; if none is accepted the area is set to ``'na'``.

    :param sample: The sample object containing missing peaks
    :type sample: :class:`pyms.Gapfill.Class.Sample`
    :param file_name: Name of the raw data file
    :type file_name: str
    :param points: Passed as the second argument to both ``savitzky_golay``
        smoothing passes. Default ``3``
    :type points: int, optional
    :param null_ions: Ions to be deleted in the matrix. Default ``[73, 147]``
    :type null_ions: list, optional
    :param crop_ions: Range of Ions to be considered. Default ``[50, 540]``
    :type crop_ions: list, optional
    :param threshold: Minimum intensity of IonChromatogram allowable to fill. Default ``1000``
    :type threshold: int, optional
    :param rt_window: Window in seconds around average RT to look for. Default ``1``
    :type rt_window: float, optional
    :param filetype: Format of ``file_name``: the string ``'mzml'`` or ``'cdf'``
        (case-insensitive), or the module-level ``MZML`` constant (default)
    :type filetype: str or int, optional

    :raises ValueError: If ``filetype`` is not one of the recognised file types.

    :author: Sean O'Callaghan
    """

    if not null_ions:
        null_ions = [73, 147]
    if not crop_ions:
        crop_ions = [50, 540]

    # TODO: some error checks on null and crop ions

    # TODO: a for root,files,dirs in os.path.walk(): loop
    print("Sample:", sample.get_name(), "File:", file_name)

    # Only strings can be lower-cased; a non-string value (such as a numeric
    # MZML constant) is compared against the constant directly instead of
    # failing with AttributeError.
    # NOTE(review): a non-string NETCDF constant is not recognised here because
    # that name is not visible from this block - confirm how it is defined.
    kind = filetype.lower() if isinstance(filetype, str) else filetype

    if kind == 'cdf':
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(file_name)
    elif kind == 'mzml' or filetype == MZML:
        from pyms.GCMS.IO.MZML import mzML_reader
        data = mzML_reader(file_name)
    else:
        # Previously this only printed a message and then failed further down
        # because ``data`` was never assigned.
        raise ValueError(f"file type not valid: {filetype!r}")

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    _, n_mz = im.size

    # smooth data: two Savitzky-Golay passes, then the tophat filter,
    # written back into the matrix
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.get_missing_peaks():

        mp_rt = mp.rt
        mp_rt_seconds = float(mp_rt)
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci = ", common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 = ", qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 = ", qual_ion_2)

        ######
        # Integrate the CI around that particular RT
        #######

        # Convert time to points: the window width in scans is the index
        # distance between the RT and the RT minus the window.
        points_1 = ci_ion_chrom.get_index_at_time(mp_rt_seconds)
        points_2 = ci_ion_chrom.get_index_at_time(mp_rt_seconds - rt_window)
        rt_window_points = points_1 - points_2
        print("rt_window = ", rt_window_points)

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt, rt_window_points)

        # Keep maxima above the threshold whose qualifying ions are both
        # above half the threshold at the same retention time.
        large_peaks = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peaks.append([rt, intens])

        print(f'found {len(large_peaks):d} peaks above threshold')

        areas = []
        for peak_rt, _ in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak_rt)
            # A fresh list is built for every call, as in the original.
            ia = ci_ion_chrom.get_intensity_array().tolist()
            # Only the area (first of the five returned values) is needed.
            area, *_ = ion_area(ia, apex, 0)
            areas.append(area)

        ########################

        if areas:
            biggest_area = max(areas)
            mp.set_ci_area(biggest_area)
            # The exact RT is stored as the RT divided by 60, to 3 decimal places
            mp.set_exact_rt(f"{mp_rt_seconds / 60.0:.3f}")
            print("found area:", biggest_area, "at rt:", mp_rt)
        else:
            print("Missing peak at rt = ", mp_rt)
            mp.set_ci_area('na')