Example #1
"""proc.py
"""

# TODO: mzML demo; need example mzML file

import pathlib
data_directory = pathlib.Path(".").resolve().parent.parent / "pyms-data"
# Change this if the data files are stored in a different location

from pyms.GCMS.IO.MZML import mzML_reader

# read the raw data
mzml_file = data_directory / ".mzML"  # TODO: file name missing; supply an example mzML file (see TODO above)
data = mzML_reader(mzml_file)
print(data)

# raw data operations
print("minimum mass found in all data: ", data.min_mass)
print("maximum mass found in all data: ", data.max_mass)

# time
time = data.time_list
print(time)
print("number of retention times: ", len(time))
print("retention time of 1st scan: ", time[0], "sec")
print("index of 400sec in time_list: ", data.get_index_at_time(400.0))

# TIC
tic = data.tic
print(tic)
print("number of scans in TIC: ", len(tic))
Example #2
def missing_peak_finder(
    sample: Sample,
    file_name: str,
    points: int = 3,
    null_ions: Optional[List] = None,
    crop_ions: Optional[List] = None,
    threshold: int = 1000,
    rt_window: float = 1,
    filetype: MissingPeakFiletype = MZML,
):
    r"""
	Integrates raw data around missing peak locations to fill ``NA``\s in the data matrix.

	:param sample: The sample object containing missing peaks
	:param file_name: Name of the raw data file
	:param points: Peak detection: a maximum over 'points' consecutive scans is treated as a peak.
	:param null_ions: Ions to be excluded (nulled) in the intensity matrix.
	:default null_ions: ``[73, 147]``
	:param crop_ions: Range of ions to be considered.
	:default crop_ions: ``[50, 540]``
	:param threshold: Minimum intensity an IonChromatogram must reach for the missing peak to be filled.
	:param rt_window: Window, in seconds, around the average retention time to search.
	:param filetype: Format of the raw data file (``NETCDF`` or ``MZML``).

	:author: Sean O'Callaghan
	"""

    if not null_ions:
        null_ions = [73, 147]
    if not crop_ions:
        crop_ions = [50, 540]

    # TODO: some error checks on null and crop ions

    # TODO: a for root, dirs, files in os.walk() loop
    print("Sample:", sample.name, "File:", file_name)

    if filetype == NETCDF:
        # this package
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(file_name)

    elif filetype == MZML:
        # this package
        from pyms.GCMS.IO.MZML import mzML_reader
        data = mzML_reader(file_name)

    else:
        # without a valid filetype there is no data to process
        raise ValueError("file type not valid")

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    n_scan, n_mz = im.size

    # smooth data
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)
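    # (savitzky_golay() is applied twice here for extra smoothing; tophat() with a
    # structural element of "1.5m", i.e. 1.5 minutes, then removes the baseline)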

    for mp in sample.missing_peaks:

        mp_rt = mp.rt
        common_ion = mp.common_ion
        qual_ion_1 = float(mp.qual_ion1)
        qual_ion_2 = float(mp.qual_ion2)

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci = ", common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 = ", qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 = ", qual_ion_2)
        ######
        # Integrate the CI around that particular RT
        #######

        # Convert the retention time window from seconds to a number of scan
        # points by comparing the scan indices at the two ends of the window.

        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        print("rt_window in points = ", points_1 - points_2)

        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        large_peaks = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peaks.append([rt, intens])

        print(f"found {len(large_peaks):d} peaks above threshold")

        areas = []
        for peak in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak[0])
            ia = ci_ion_chrom.intensity_array.tolist()
            area, left, right, l_share, r_share = ion_area(ia, apex, 0)
            areas.append(area)

        ########################

        areas.sort()
        if len(areas) > 0:
            biggest_area = areas[-1]
            mp.common_ion_area = biggest_area
            # mp.exact_rt = f"{float(mp_rt) / 60.0:.3f}"
            mp.exact_rt = float(mp_rt) / 60.0
            print("found area:", biggest_area, "at rt:", mp_rt)
        else:
            print("Missing peak at rt = ", mp_rt)
            mp.common_ion_area = None
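
# Hedged usage sketch (not part of the original listing): ``sample`` is assumed to be
# a pyms Gapfill ``Sample`` object already populated with the missing peaks detected
# during alignment, ``MZML`` the filetype constant imported alongside it, and the
# file name below is only a placeholder.
# missing_peak_finder(
#     sample,
#     "sample_077.mzML",
#     points=3,
#     threshold=1000,
#     rt_window=1.0,
#     filetype=MZML,
# )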
Example #3
    def run(self, original_filename, original_filetype):
        """
		Load the original data from the given datafile and perform quantitative analysis
		
		:param original_filename:
		:type original_filename:
		:param original_filetype:
		:type original_filetype:
		"""

        self.original_filename = str(original_filename)
        self.original_filetype = int(original_filetype)

        print("Quantitative Processing in Progress...")

        # TODO: Include data etc. in experiment file

        if self.original_filetype == ID_Format_jcamp:
            # Load data using JCAMP_reader
            from pyms.GCMS.IO.JCAMP import JCAMP_reader
            self.gcms_data = JCAMP_reader(self.original_filename)

        elif self.original_filetype == ID_Format_mzML:
            # Load data using mzML_reader
            from pyms.GCMS.IO.MZML import mzML_reader
            self.gcms_data = mzML_reader(self.original_filename)

        elif self.original_filetype == ID_Format_ANDI:
            # Load data using ANDI_reader
            from pyms.GCMS.IO.ANDI import ANDI_reader
            self.gcms_data = ANDI_reader(self.original_filename)

        else:
            # Unknown Format
            return
        # TODO: Waters RAW, Thermo RAW, Agilent .d

        method = Method.Method(self.method.value)

        # list of all retention times, in seconds
        # times = self.gcms_data.get_time_list()
        # get Total Ion Chromatogram
        self.tic = self.gcms_data.get_tic()
        # RT Range, time step, no. scans, min, max, mean and median m/z
        self.gcms_data.info()

        self.get_info_from_gcms_data()

        # Build "intensity matrix" by binning data with integer bins and a
        # 	window of -0.3 to +0.7, the same as NIST uses
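        # 	(so bin 73 covers m/z 72.7 up to 73.7: an ion at m/z 73.35 falls
        # 	into bin 73, while one at m/z 73.85 falls into bin 74)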
        self.intensity_matrix = build_intensity_matrix_i(self.gcms_data)

        # Show the m/z of the maximum and minimum bins
        print(" Minimum m/z bin: {}".format(
            self.intensity_matrix.get_min_mass()))
        print(" Maximum m/z bin: {}".format(
            self.intensity_matrix.get_max_mass()))

        # Crop masses
        min_mass, max_mass, *_ = method.mass_range

        if min_mass < self.intensity_matrix.get_min_mass():
            min_mass = self.intensity_matrix.get_min_mass()
        if max_mass > self.intensity_matrix.get_max_mass():
            max_mass = self.intensity_matrix.get_max_mass()
        self.intensity_matrix.crop_mass(min_mass, max_mass)

        # Perform Data filtering
        n_scan, n_mz = self.intensity_matrix.get_size()

        # Iterate over each IC in the intensity matrix
        for ii in range(n_mz):
            # print("\rWorking on IC#", ii+1, '  ',end='')
            ic = self.intensity_matrix.get_ic_at_index(ii)

            if method.expr_creation_enable_sav_gol:
                # Perform Savitzky-Golay smoothing.
                # Note that TurboMass does not use smoothing for the qualitative method.
                ic = savitzky_golay(ic)

            if method.expr_creation_enable_tophat:
                # Perform Tophat baseline correction.
                # Top-hat baseline correction appears to reduce noise while
                # retaining peak shapes and the points on actual peaks
                ic = tophat(ic, struct=method.tophat_struct)

            # Set the IC in the intensity matrix to the filtered one
            self.intensity_matrix.set_ic_at_index(ii, ic)

        # Peak Detection based on Biller and Biemann (1974), with a window
        # 	of <points>, and combining <scans> if they apex next to each other
        peak_list = BillerBiemann(
            self.intensity_matrix,
            points=method.expr_creation_bb_points,
            scans=method.expr_creation_bb_scans,
        )

        print(" Number of peaks identified before filtering: {}".format(
            len(peak_list)))

        if method.expr_creation_enable_noise_filter:
            # Filtering peak lists with automatic noise filtering
            noise_level = window_analyzer(self.tic)
            # should we also do rel_threshold() here?
            # https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
            peak_list = num_ions_threshold(peak_list,
                                           method.expr_creation_noise_thresh,
                                           noise_level)

        self.peak_list = []

        for peak_idx, peak in enumerate(peak_list):
            # Get mass and intensity lists for the mass spectrum at the apex of the peak
            apex_mass_list = peak.mass_spectrum.mass_list
            apex_mass_spec = peak.mass_spectrum.mass_spec

            # Determine the intensity of the base peak in the mass spectrum
            base_peak_intensity = max(apex_mass_spec)

            # Determine the index of the base peak in the mass spectrum
            base_peak_index = [
                index for index, intensity in enumerate(apex_mass_spec)
                if intensity == base_peak_intensity
            ][0]

            # Finally, determine the mass of the base peak
            base_peak_mass = apex_mass_list[base_peak_index]

            # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
            if base_peak_mass in method.base_peak_filter:
                continue

            area = peak_sum_area(self.intensity_matrix, peak)
            peak.set_area(area)
            self.peak_list.append(peak)

        print(" Number of peaks identified: {}".format(len(self.peak_list)))

        # Create an experiment
        self.expr = pyms.Experiment.Experiment(self.name, self.peak_list)
        self.expr.sele_rt_range([
            "{}m".format(method.target_range[0]),
            "{}m".format(method.target_range[1])
        ])
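
        # A possible follow-up step (not part of the original listing): persist the
        # experiment so it can be reloaded later, assuming PyMassSpec's
        # ``Experiment.dump()`` is available, e.g.:
        # 	self.expr.dump(f"{self.name}.expr")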