"""proc.py """ # TODO: mzML demo; need example mzML file import pathlib data_directory = pathlib.Path(".").resolve().parent.parent / "pyms-data" # Change this if the data files are stored in a different location from pyms.GCMS.IO.MZML import mzML_reader # read the raw data mzml_file = data_directory / ".mzML" data = mzML_reader(mzml_file) print(data) # raw data operations print("minimum mass found in all data: ", data.min_mass) print("maximum mass found in all data: ", data.max_mass) # time time = data.time_list print(time) print("number of retention times: ", len(time)) print("retention time of 1st scan: ", time[0], "sec") print("index of 400sec in time_list: ", data.get_index_at_time(400.0)) # TIC tic = data.tic print(tic) print("number of scans in TIC: ", len(tic))
def missing_peak_finder(
    sample: Sample,
    file_name: str,
    points: int = 3,
    null_ions: Optional[List] = None,
    crop_ions: Optional[List] = None,
    threshold: int = 1000,
    rt_window: float = 1,
    filetype: MissingPeakFiletype = MZML,
):
    r"""
    Integrates raw data around missing peak locations to fill ``NA``\s in the data matrix.

    :param sample: The sample object containing missing peaks.
    :param file_name: Name of the raw data file.
    :param points: Peak finding: a maximum is considered a peak if it is the maximum over 'points' number of scans.
    :param null_ions: Ions to be deleted in the matrix.
    :default null_ions: ``[73, 147]``
    :param crop_ions: Range of ions to be considered.
    :default crop_ions: ``[50, 540]``
    :param threshold: Minimum intensity of an IonChromatogram allowable to fill.
    :param rt_window: Window, in seconds, around the average RT in which to search.
    :param filetype: Format of the raw data file (``MZML`` or ``NETCDF``).

    :author: Sean O'Callaghan
    """

    if not null_ions:
        null_ions = [73, 147]
    if not crop_ions:
        crop_ions = [50, 540]

    # TODO: some error checks on null and crop ions
    # TODO: a for root, dirs, files in os.walk() loop

    print("Sample:", sample.name, "File:", file_name)

    if filetype == NETCDF:
        # this package
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(file_name)
    elif filetype == MZML:
        # this package
        from pyms.GCMS.IO.MZML import mzML_reader
        data = mzML_reader(file_name)
    else:
        raise ValueError(f"file type not valid: {filetype!r}")

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    n_scan, n_mz = im.size

    # smooth data
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.missing_peaks:

        mp_rt = mp.rt
        common_ion = mp.common_ion
        qual_ion_1 = float(mp.qual_ion1)
        qual_ion_2 = float(mp.qual_ion2)

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci = ", common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 = ", qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 = ", qual_ion_2)

        ######
        # Integrate the CI around that particular RT
        #######

        # Convert time to points
        # How long between scans?

        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        print("rt_window = ", points_1 - points_2)

        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt, rt_window_points)

        large_peaks = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peaks.append([rt, intens])

        print(f"found {len(large_peaks):d} peaks above threshold")

        areas = []
        for peak in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak[0])
            ia = ci_ion_chrom.intensity_array.tolist()
            area, left, right, l_share, r_share = ion_area(ia, apex, 0)
            areas.append(area)

        ########################

        areas.sort()
        if len(areas) > 0:
            biggest_area = areas[-1]
            mp.common_ion_area = biggest_area
            # mp.exact_rt = f"{float(mp_rt) / 60.0:.3f}"
            mp.exact_rt = float(mp_rt) / 60.0
            print("found area:", biggest_area, "at rt:", mp_rt)
        else:
            print("Missing peak at rt = ", mp_rt)
            mp.common_ion_area = None
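# Hypothetical usage sketch for missing_peak_finder() (not part of the original
# module): `fill_sample` and its arguments are placeholders. It assumes `sample`
# is a Sample already populated with MissingPeak entries by the preceding
# alignment/gap-detection step, and `raw_file` is the matching raw data file.
def fill_sample(sample: Sample, raw_file: str) -> None:
    """Integrate missing peaks for one sample and report the recovered areas."""
    missing_peak_finder(
        sample,
        raw_file,
        points=3,
        threshold=1000,
        rt_window=1.0,
        filetype=MZML,
    )

    # missing_peak_finder() sets common_ion_area on each MissingPeak
    # (or leaves it as None if nothing above threshold was found near the expected RT)
    for mp in sample.missing_peaks:
        print(mp.rt, mp.common_ion, mp.common_ion_area)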
def run(self, original_filename, original_filetype):
    """
    Load the original data from the given datafile and perform quantitative analysis.

    :param original_filename: Path of the raw data file to load.
    :type original_filename: str
    :param original_filetype: Identifier for the format of the raw data file
        (e.g. ``ID_Format_jcamp``, ``ID_Format_mzML``, ``ID_Format_ANDI``).
    :type original_filetype: int
    """

    self.original_filename = str(original_filename)
    self.original_filetype = int(original_filetype)

    print("Quantitative Processing in Progress...")

    # TODO: Include data etc. in experiment file

    if self.original_filetype == ID_Format_jcamp:
        # Load data using JCAMP_reader
        from pyms.GCMS.IO.JCAMP import JCAMP_reader
        self.gcms_data = JCAMP_reader(self.original_filename)
    elif self.original_filetype == ID_Format_mzML:
        # Load data using mzML_reader
        from pyms.GCMS.IO.MZML import mzML_reader
        self.gcms_data = mzML_reader(self.original_filename)
    elif self.original_filetype == ID_Format_ANDI:
        # Load data using ANDI_reader
        from pyms.GCMS.IO.ANDI import ANDI_reader
        self.gcms_data = ANDI_reader(self.original_filename)
    else:
        # Unknown Format
        return

    # TODO: Waters RAW, Thermo RAW, Agilent .d

    method = Method.Method(self.method.value)

    # list of all retention times, in seconds
    # times = self.gcms_data.get_time_list()

    # get Total Ion Chromatogram
    self.tic = self.gcms_data.get_tic()

    # RT Range, time step, no. scans, min, max, mean and median m/z
    self.gcms_data.info()

    self.get_info_from_gcms_data()

    # Build "intensity matrix" by binning data with integer bins and a
    # window of -0.3 to +0.7, the same as NIST uses
    self.intensity_matrix = build_intensity_matrix_i(self.gcms_data)

    # Show the m/z of the maximum and minimum bins
    print(" Minimum m/z bin: {}".format(self.intensity_matrix.get_min_mass()))
    print(" Maximum m/z bin: {}".format(self.intensity_matrix.get_max_mass()))

    # Crop masses
    min_mass, max_mass, *_ = method.mass_range

    if min_mass < self.intensity_matrix.get_min_mass():
        min_mass = self.intensity_matrix.get_min_mass()
    if max_mass > self.intensity_matrix.get_max_mass():
        max_mass = self.intensity_matrix.get_max_mass()

    self.intensity_matrix.crop_mass(min_mass, max_mass)

    # Perform Data filtering
    n_scan, n_mz = self.intensity_matrix.get_size()

    # Iterate over each IC in the intensity matrix
    for ii in range(n_mz):
        # print("\rWorking on IC#", ii + 1, ' ', end='')
        ic = self.intensity_matrix.get_ic_at_index(ii)

        if method.expr_creation_enable_sav_gol:
            # Perform Savitzky-Golay smoothing.
            # Note that Turbomass does not use smoothing for the qualitative method.
            ic = savitzky_golay(ic)

        if method.expr_creation_enable_tophat:
            # Perform Tophat baseline correction.
            # Top-hat baseline correction seems to bring down noise,
            # retaining shapes, but keeps points on actual peaks
            ic = tophat(ic, struct=method.tophat_struct)

        # Set the IC in the intensity matrix to the filtered one
        self.intensity_matrix.set_ic_at_index(ii, ic)

    # Peak Detection based on Biller and Biemann (1974), with a window
    # of <points>, and combining <scans> if they apex next to each other
    peak_list = BillerBiemann(
        self.intensity_matrix,
        points=method.expr_creation_bb_points,
        scans=method.expr_creation_bb_scans,
    )

    print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

    if method.expr_creation_enable_noise_filter:
        # Filtering peak lists with automatic noise filtering
        noise_level = window_analyzer(self.tic)
        # should we also do rel_threshold() here?
        # https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
        peak_list = num_ions_threshold(peak_list, method.expr_creation_noise_thresh, noise_level)

    self.peak_list = []

    for peak_idx, peak in enumerate(peak_list):
        # Get mass and intensity lists for the mass spectrum at the apex of the peak
        apex_mass_list = peak.mass_spectrum.mass_list
        apex_mass_spec = peak.mass_spectrum.mass_spec

        # Determine the intensity of the base peak in the mass spectrum
        base_peak_intensity = max(apex_mass_spec)

        # Determine the index of the base peak in the mass spectrum
        base_peak_index = [
            index for index, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity
        ][0]

        # Finally, determine the mass of the base peak
        base_peak_mass = apex_mass_list[base_peak_index]

        # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
        if base_peak_mass in method.base_peak_filter:
            continue

        area = peak_sum_area(self.intensity_matrix, peak)
        peak.set_area(area)
        self.peak_list.append(peak)

    print(" Number of peaks identified: {}".format(len(self.peak_list)))

    # Create an experiment
    self.expr = pyms.Experiment.Experiment(self.name, self.peak_list)
    self.expr.sele_rt_range([
        "{}m".format(method.target_range[0]),
        "{}m".format(method.target_range[1]),
    ])
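# Hedged follow-up sketch (not part of the original method): persist the
# Experiment built by run() so it can be reloaded later. `store_experiment` and
# `expr_file` are placeholders, `processor` stands for whatever object defines
# run() above, and pyms.Experiment.store_expr() is assumed to be available as
# in the PyMassSpec API.
def store_experiment(processor, expr_file: str) -> None:
    """Write the Experiment created by run() to `expr_file`."""
    pyms.Experiment.store_expr(expr_file, processor.expr)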