def call_peaks(im, tic, smooth, args):
    """Detect peaks in an intensity matrix, filter them and attach areas.

    @param im: Intensity matrix; cropped and overwritten in place when
        ``smooth`` is true
    @param tic: Total ion chromatogram, used only for the noise report
    @param smooth: If true, crop the mass range and smooth/baseline-correct
        every ion chromatogram before peak detection
    @param args: Options object; reads ``lowmass``, ``highmass``,
        ``window``, ``scans``, ``minintensity``, ``minions`` and ``topions``
    @return: The peak list returned by num_ions_threshold, with area and
        top-ion areas set on every peak
    """
    print("calling peaks")

    # NOTE(review): everything down to the noise-level report is treated as
    # part of the `if smooth:` branch; confirm against the intended flow.
    if smooth:
        print("Smoothing IM first...")
        im.crop_mass(args.lowmass, args.highmass)
        print("cropped masses...")

        # Only the number of masses is needed from the matrix size.
        _, n_mz = im.get_size()
        print("%s %s" % ("# masses in intensity matrix: ", n_mz))

        for mass_index in range(n_mz):
            raw_ic = im.get_ic_at_index(mass_index)
            # JT: polynomial degree changed to 4 from 2
            smoothed_ic = savitzky_golay(raw_ic, window=args.window, degree=4)
            im.set_ic_at_index(mass_index, tophat(smoothed_ic, struct="1.5m"))
        print("smoothed IM...")

        # Noise level of the smoothed, baseline-corrected TIC.  It is only
        # reported here; the ion filter below uses a fixed cutoff instead.
        # JT: How does struct size work?
        corrected_tic = tophat(savitzky_golay(tic), struct="1.5m")
        noise_level = window_analyzer(corrected_tic)
        print("%s %s" % ("Noise level in TIC: ", noise_level))

    # Peak objects from BB (Biller-Biemann) peak detection / deconvolution.
    detected = BillerBiemann(im, args.window, args.scans)
    print("%s %s" % ("Initial number of Peaks found:", len(detected)))

    # Filter down the peaks:
    # - first, remove from each peak any mass whose intensity is below
    #   args.minintensity percent of the maximum intensity in that peak;
    # - second, remove any peak with fewer than args.minions ions above the
    #   cutoff.
    trimmed = rel_threshold(detected, percent=args.minintensity)
    # Cutoff notes: 100000 for pegBT, 200 for peg3; minions maybe 3 instead
    # of 4?
    # JT: Was getting very different noise cutoff values so just made it
    # 10^5, which was decided on by looking at chromatograms to find the
    # baseline noise level.
    kept = num_ions_threshold(trimmed, n=args.minions, cutoff=100000)
    print("%s %s" % ("Peaks remaining after filtering:", len(kept)))

    # Masking of masses 73, 207 (column bleed) and 84 (solvent tailing) via
    # peak.null_mass(...) is disabled in this version.
    for peak in kept:
        # TIC area for this peak
        peak.set_area(peak_sum_area(im, peak))
        # Areas of the top args.topions ions for this peak
        peak.set_ion_areas(peak_top_ion_areas(im, peak, args.topions))

    return kept
def call_peaks(im, tic, smooth, args):
    """Detect peaks in an intensity matrix, filter them and attach areas.

    @param im: Intensity matrix; cropped and overwritten in place when
        ``smooth`` is true
    @param tic: Total ion chromatogram used to estimate the noise level
    @param smooth: If true, crop the mass range and smooth/baseline-correct
        every ion chromatogram before peak detection
    @param args: Options object; reads ``lowmass``, ``highmass``,
        ``window``, ``scans``, ``minintensity``, ``minions``, ``noisemult``
        and ``topions``
    @return: The peak list returned by num_ions_threshold, with masses 207
        and 84 nulled and area / top-ion areas set on every peak
    """
    print("calling peaks")

    if smooth:
        print("Smoothing IM first...")
        im.crop_mass(args.lowmass, args.highmass)
        print("cropped masses...")

        # Only the number of masses is needed from the matrix size.
        _, n_mz = im.get_size()
        print("%s %s" % ("# masses in intensity matrix: ", n_mz))

        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            ic_smooth = savitzky_golay(ic, window=args.window, degree=2)
            ic_base = tophat(ic_smooth, struct="1.5m")
            im.set_ic_at_index(ii, ic_base)
        print("smoothed IM...")

    # Noise level of the smoothed, baseline-corrected TIC.  Computed
    # regardless of `smooth` because the ion-count cutoff below depends on
    # it; leaving it unbound on the non-smoothing path would raise NameError.
    tic1 = savitzky_golay(tic)
    tic2 = tophat(tic1, struct="1.5m")
    noise_level = window_analyzer(tic2)
    print("%s %s" % ("Noise level in TIC: ", noise_level))

    # Peak objects from BB (Biller-Biemann) peak detection / deconvolution.
    pl = BillerBiemann(im, args.window, args.scans)
    print("%s %s" % ("Initial number of Peaks found:", len(pl)))

    # Filter down the peaks:
    # - first, remove from each peak any mass whose intensity is below
    #   args.minintensity percent of the maximum intensity in that peak;
    # - second, remove any peak with fewer than args.minions ions above the
    #   cutoff (noise level scaled by args.noisemult).
    pl2 = rel_threshold(pl, percent=args.minintensity)
    pl3 = num_ions_threshold(
        pl2, n=args.minions, cutoff=noise_level * args.noisemult)
    print("%s %s" % ("Peaks remaining after filtering:", len(pl3)))

    for peak in pl3:
        # Masking of mass 73 is disabled in this version.
        peak.null_mass(207)  # column bleed
        peak.null_mass(84)  # solvent tailing

        # TIC area for this peak
        peak.set_area(peak_sum_area(im, peak))
        # Areas of the top args.topions ions for this peak
        peak.set_ion_areas(peak_top_ion_areas(im, peak, args.topions))

    return pl3
def Preprocess_IntensityMatrixes(matrixes):
    """
    Noise removal and baseline correction of intensity matrices.

    Every ion chromatogram of every matrix is passed through savitzky_golay
    and then tophat (struct='1.5m') and written back into its matrix, so the
    matrices are modified in place.  A progress line is printed for each
    ion chromatogram.

    @param matrixes: List of intensity matrices to correct
    @return: The same list object that was passed in
    """
    # Number the matrices from 1 so the first one is reported as "Unit 1"
    # (a counter initialised to 1 and bumped before use would start at 2).
    for unit, im in enumerate(matrixes, 1):
        _, n_mz = im.get_size()
        for ii in range(n_mz):
            print("Working on IC#", ii + 1, " Unit", unit)
            ic = im.get_ic_at_index(ii)
            ic_smoof = savitzky_golay(ic)
            ic_bc = tophat(ic_smoof, struct='1.5m')
            im.set_ic_at_index(ii, ic_bc)

    # NOTE(review): the original carried a "save to file" remark here;
    # nothing is written to disk by this function.
    return matrixes
def Preprocess_IntensityMatrices(matrices):
    """
    Baseline correction and smoothing of intensity matrices.

    Every ion chromatogram of every matrix is passed through savitzky_golay
    and then tophat (struct='1.5m') and written back into its matrix, so the
    matrices are modified in place.

    @param matrices: List of matrices generated by the matrix_from_cdf
        method
    @return: The same list object, whose matrices have been baseline
        corrected & smoothed for peak detection
    """
    for im in matrices:
        # Only the number of masses is needed from the matrix size.
        _, n_mz = im.get_size()
        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            ic_smoof = savitzky_golay(ic)
            ic_bc = tophat(ic_smoof, struct='1.5m')
            im.set_ic_at_index(ii, ic_bc)

    # NOTE(review): the original carried a "save to file" remark here;
    # nothing is written to disk by this function.
    return matrices
def Preprocess_IntensityMatrixes(matrixes):
    '''
    Noise removal and baseline correction of intensity matrices.

    Every ion chromatogram of every matrix is passed through savitzky_golay
    and then tophat (struct='1.5m') and written back into its matrix, so the
    matrices are modified in place.

    @param matrixes: List of matrices generated by the matrix_from_cdf
        method
    @return: The same list object, whose matrices have been 'cleansed'
    '''
    for im in matrixes:
        # Only the number of masses is needed from the matrix size.
        _, n_mz = im.get_size()
        for ii in range(n_mz):
            ic = im.get_ic_at_index(ii)
            ic_smoof = savitzky_golay(ic)
            ic_bc = tophat(ic_smoof, struct='1.5m')
            im.set_ic_at_index(ii, ic_bc)

    # NOTE(review): the original carried a "save to file" remark here;
    # nothing is written to disk by this function.
    return matrixes
# Read the raw data.  An optional scan-range trim is currently disabled:
#     data.trim(2431, 2469)
data = ANDI_reader(andi_file)

# IntensityMatrix
# default, float masses with interval (bin interval) of one from min mass
print("default intensity matrix, bin interval = 1, boundary +/- 0.5")
im = build_intensity_matrix(data)

# Blank out masses 73 and 147 before any further processing.
im.null_mass(73)
im.null_mass(147)

# Smooth and baseline-correct every ion chromatogram in place.
n_scan, n_mz = im.get_size()
for ii in range(n_mz):
    ic_base = tophat(savitzky_golay(im.get_ic_at_index(ii)), struct="1.5m")
    im.set_ic_at_index(ii, ic_base)

# Load the experiment and take its peak list.
exper = load_expr(expr_file)
peak_list = exper.get_peak_list()

# Collect the (now corrected) ion chromatograms into a list of ICs.
n_mz = len(im.get_mass_list())
ic = [im.get_ic_at_index(m) for m in range(n_mz)]
def missing_peak_finder(sample, andi_file, points=7, null_ions=[73, 207],
                        crop_ions=[45,300], threshold=100000, rt_window=10):
    """
    @summary: Integrates raw data around missing peak locations
        to fill in NAs in the data matrix

    For each missing peak of the sample, the common-ion chromatogram is
    searched for maxima around the expected retention time.  A maximum
    counts when it exceeds 'threshold' and both qualifier ions exceed half
    of 'threshold' at the same time.  The largest integrated area among the
    accepted maxima is stored with set_ci_area; 'NA' is stored when none is
    accepted.

    @param sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample

    @param andi_file: Name of the raw data file
    @type andi_file: stringType

    @param points: Passed as the second positional argument to
        savitzky_golay for both smoothing passes (Default 7)
    @type points: intType

    @param null_ions: Ions to be deleted in the matrix
    @type null_ions: listType

    @param crop_ions: Range of ions to be considered; elements 0 and 1
        are passed to crop_mass
    @type crop_ions: listType

    @param threshold: Minimum intensity of IonChromatogram allowable to
        fill missing peak
    @type threshold: intType

    @param rt_window: Window in seconds around average RT to look for
        missing peak
    @type rt_window: floatType

    @author: Sean O'Callaghan
    """
    # The list defaults are only iterated / indexed here, never modified.

    ### some error checks on null and crop ions
    ### a for root,files,dirs in os.path.walk(): loop

    print("%s %s %s %s" % ("Sample:", sample.get_name(),
                           "andi_file:", andi_file))

    data = ANDI_reader(andi_file)

    # build integer intensity matrix, then drop / crop the requested ions
    im = build_intensity_matrix_i(data)
    for null_ion in null_ions:
        im.null_mass(null_ion)
    im.crop_mass(crop_ions[0], crop_ions[1])

    # Smooth every ion chromatogram twice, then baseline-correct it.
    _, n_mz = im.get_size()
    for mass_index in range(n_mz):
        once = savitzky_golay(im.get_ic_at_index(mass_index), points)
        twice = savitzky_golay(once, points)
        im.set_ic_at_index(mass_index, tophat(twice, struct="1.5m"))

    for mp in sample.get_missing_peaks():
        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)

        ######
        # Integrate the CI around that particular RT
        #######

        # Convert the time window into a number of scans by looking up the
        # scan index at the RT and at the RT minus the window.
        rt_value = float(mp_rt)
        index_at_rt = ci_ion_chrom.get_index_at_time(rt_value)
        index_at_start = ci_ion_chrom.get_index_at_time(rt_value - rt_window)
        rt_window_points = index_at_rt - index_at_start

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        # Retention times of maxima supported by both qualifier ions.
        accepted_rts = []
        for rt, intens in maxima_list:
            if not intens > threshold:
                continue
            q1_index = qi1_ion_chrom.get_index_at_time(rt)
            q2_index = qi2_ion_chrom.get_index_at_time(rt)
            q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
            q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)
            if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                accepted_rts.append(rt)

        # Integrate the common ion at every accepted apex; only the area
        # (first element of the ion_area result) is used.
        areas = []
        for peak_rt in accepted_rts:
            apex = ci_ion_chrom.get_index_at_time(peak_rt)
            ia = ci_ion_chrom.get_intensity_array().tolist()
            areas.append(ion_area(ia, apex, 0)[0])

        ########################
        if areas:
            mp.set_ci_area(max(areas))
        else:
            mp.set_ci_area('NA')
# Read the raw data as a GCMS_data object and build the integer-mass
# intensity matrix from it.
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)
im = build_intensity_matrix_i(data)

n_scan, n_mz = im.get_size()
print("%s %s" % ("Intensity matrix size (scans, masses):", (n_scan, n_mz)))

# Noise filter and baseline correct every ion chromatogram in place.
for ii in range(n_mz):
    ic_bc = tophat(savitzky_golay(im.get_ic_at_index(ii)), struct="1.5m")
    im.set_ic_at_index(ii, ic_bc)

# Use the Biller and Biemann technique to find apexing ions at a scan.
# The default is maxima over three scans, without combining with any
# neighbouring scan.
peak_list = BillerBiemann(im)
print("%s %s" % ("Number of peaks found: ", len(peak_list)))

# Find apexes over 9 points and combine with a neighbouring peak if two
# scans apex next to each other.
peak_list = BillerBiemann(im, points=9, scans=2)
print("%s %s" % ("Number of peaks found: ", len(peak_list)))
# Read in the raw data and keep only the trimmed range 4101-4350.
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)
data.trim(4101, 4350)

# Build the intensity matrix.
real_im = build_intensity_matrix_i(data)
n_scan, n_mz = real_im.get_size()

# Perform the necessary pre-filtering: smooth, then baseline-correct, every
# ion chromatogram in place.
for ii in range(n_mz):
    ic_bc = tophat(savitzky_golay(real_im.get_ic_at_index(ii)),
                   struct="1.5m")
    real_im.set_ic_at_index(ii, ic_bc)

# Detect peaks.
peak_list = BillerBiemann(real_im, points=3, scans=2)
print("%s %s" % ("Number of peaks found in real data: ", len(peak_list)))

######### Filter peaks ###############
# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have less than a given number of ions
# above a given value

# Parameters
"""proc.py

Smooth and baseline-correct the TIC of one ANDI-MS file and save the raw,
smoothed and smoothed + baseline-corrected traces.
"""

import sys
sys.path.append("/x/PyMS/")

from pyms.GCMS.IO.ANDI.Function import ANDI_reader
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Baseline.TopHat import tophat

# read the raw data
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# get the TIC
tic = data.get_tic()

# apply noise smoothing, then baseline correction
tic1 = savitzky_golay(tic)
tic2 = tophat(tic1, struct="1.5m")

# save the raw, smoothed and smoothed/baseline-corrected TIC, in that order
for trace, out_path in (
    (tic, "output/tic.dat"),
    (tic1, "output/tic_smooth.dat"),
    (tic2, "output/tic_smooth_bc.dat"),
):
    trace.write(out_path, minutes=True)
"""proc.py

Smooth and baseline-correct the TIC of one ANDI-MS file and save the raw,
smoothed and smoothed + baseline-corrected traces.
"""

import sys
sys.path.append("/x/PyMS/")

from pyms.GCMS.IO.ANDI.Function import ANDI_reader
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Baseline.TopHat import tophat

# read the raw data
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# get the TIC
tic = data.get_tic()

# apply noise smoothing, then baseline correction
tic1 = savitzky_golay(tic)
tic2 = tophat(tic1, struct="1.5m")

# save the raw, smoothed and smoothed/baseline-corrected TIC, in that order
for trace, out_path in (
    (tic, "output/tic.dat"),
    (tic1, "output/tic_smooth.dat"),
    (tic2, "output/tic_smooth_bc.dat"),
):
    trace.write(out_path, minutes=True)
def missing_peak_finder(sample, filename, points=13, null_ions=[73, 147],
                        crop_ions=[50,540], threshold=1000, rt_window=1,
                        filetype='cdf'):
    """
    @summary: Integrates raw data around missing peak locations
        to fill in NAs in the data matrix

    For each missing peak of the sample, the common-ion chromatogram is
    searched for maxima around the expected retention time.  A maximum
    counts when it exceeds 'threshold' and both qualifier ions exceed half
    of 'threshold' at the same time.  The largest integrated area among the
    accepted maxima is stored with set_ci_area; 'na' is stored when none is
    accepted.  Progress information is printed along the way.

    @param sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample

    @param filename: Name of the raw data file
    @type filename: stringType

    @param points: Passed as the second positional argument to
        savitzky_golay for both smoothing passes (Default 13)
    @type points: intType

    @param null_ions: Ions to be deleted in the matrix
    @type null_ions: listType

    @param crop_ions: Range of ions to be considered; elements 0 and 1
        are passed to crop_mass
    @type crop_ions: listType

    @param threshold: Minimum intensity of IonChromatogram allowable to
        fill missing peak
    @type threshold: intType

    @param rt_window: Window in seconds around average RT to look for
        missing peak
    @type rt_window: floatType

    @param filetype: Format of the raw data file: 'cdf' (read with
        ANDI_reader) or 'mzml' (read with mzML_reader)
    @type filetype: stringType

    @raise ValueError: if filetype is neither 'cdf' nor 'mzml'

    @author: Sean O'Callaghan
    """
    # The list defaults are only iterated / indexed here, never modified.

    # Reject an unknown file type before doing any work.  Merely reporting
    # it would leave `data` unbound and fail later with a confusing
    # NameError when the intensity matrix is built.
    if filetype not in ('cdf', 'mzml'):
        raise ValueError("file type not valid: %r" % (filetype,))

    ### some error checks on null and crop ions
    ### a for root,files,dirs in os.path.walk(): loop

    print("%s %s %s %s" % ("Sample:", sample.get_name(), "File:", filename))

    if filetype == 'cdf':
        data = ANDI_reader(filename)
    else:
        data = mzML_reader(filename)

    # build integer intensity matrix, then drop / crop the requested ions
    im = build_intensity_matrix_i(data)
    for null_ion in null_ions:
        im.null_mass(null_ion)
    im.crop_mass(crop_ions[0], crop_ions[1])

    # Smooth every ion chromatogram twice, then baseline-correct it.
    _, n_mz = im.get_size()
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.get_missing_peaks():
        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("%s %s" % ("ci = ", common_ion))
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("%s %s" % ("qi1 = ", qual_ion_1))
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("%s %s" % ("qi2 = ", qual_ion_2))

        ######
        # Integrate the CI around that particular RT
        #######

        # Convert the time window into a number of scans by looking up the
        # scan index at the RT and at the RT minus the window.
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        print("%s %s" % ("rt_window = ", points_1 - points_2))

        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        # Keep maxima above the threshold whose qualifier ions both exceed
        # half the threshold at the same retention time.
        large_peaks = []
        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if (q1_intensity > threshold / 2
                        and q2_intensity > threshold / 2):
                    large_peaks.append([rt, intens])

        print('found %d peaks above threshold' % len(large_peaks))

        # Integrate the common ion at every accepted apex; only the area
        # (first element of the ion_area result) is used.
        areas = []
        for peak in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak[0])
            ia = ci_ion_chrom.get_intensity_array().tolist()
            areas.append(ion_area(ia, apex, 0)[0])

        ########################
        if areas:
            biggest_area = max(areas)
            mp.set_ci_area(biggest_area)
            print("%s %s %s %s" % ("found area:", biggest_area,
                                   "at rt:", mp_rt))
        else:
            print("%s %s" % ("Missing peak at rt = ", mp_rt))
            mp.set_ci_area('na')
# Name of the ANDI-MS file for this experiment code; read it and build the
# integer-mass intensity matrix.
andi_file = os.path.join(base_path, expr_code + ".cdf")
data = ANDI_reader(andi_file)
im = build_intensity_matrix_i(data)

# get the size of the intensity matrix
n_scan, n_mz = im.get_size()

# Smooth every ion chromatogram twice, then baseline-correct it in place.
for ii in range(n_mz):
    ic_smooth = savitzky_golay(savitzky_golay(im.get_ic_at_index(ii)))
    ic_base = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_base)

# do peak detection on pre-trimmed data

# get the list of Peak objects
pl = BillerBiemann(im, points, scans)

# trim by relative intensity
apl = rel_threshold(pl, r)

# trim by threshold
peak_list = num_ions_threshold(apl, n, t)

print("%s %s" % ("\t -> Number of Peaks found:", len(peak_list)))