Esempio n. 1
0
def missing_peak_finder(sample, filename, points=13, null_ions=[73, 147],
                        crop_ions=[50,540], threshold=1000, rt_window=1,
                        filetype='mzml'):
    """
    @summary: Integrates raw data around missing peak locations
              to fill in NAs in the data matrix

    For every missing peak of the sample, the common-ion chromatogram is
    searched for maxima around the peak's retention time.  A maximum is
    accepted when its intensity exceeds 'threshold' and the two qualifier
    ion chromatograms both exceed threshold / 2 at the same time.  The
    largest area returned by ion_area() over the accepted maxima is stored
    on the missing peak; if no maximum is accepted the area is set to 'na'.

    @param  sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample

    @param  filename: Name of the raw data file
    @type filename: stringType

    @param  points: Second argument handed to savitzky_golay() for both
                    smoothing passes (Default 13)
    @type points: intType

    @param  null_ions: Ions to be deleted in the matrix
    @type null_ions: listType

    @param crop_ions: Range of Ions to be considered; element 0 and
                      element 1 are passed to crop_mass()
    @type crop_ions: listType

    @param threshold: Minimum intensity of IonChromatogram allowable to fill
                      missing peak
    @type threshold: intType

    @param  rt_window: Window around average RT to look for missing peak,
                       in the time units of the chromatogram (presumably
                       seconds -- confirm against the reader)
    @type rt_window: floatType

    @param filetype: either 'mzml' or 'cdf' (compared case-insensitively)
    @type filetype: stringType

    @raise ValueError: if filetype is neither 'cdf' nor 'mzml'

    @author: Sean O'Callaghan
    """

    # NOTE: the list defaults above are only read (iterated / indexed),
    # never mutated, so sharing them between calls is harmless here.

    print("Sample: %s File: %s" % (sample.get_name(), filename))

    # Select the reader.  An unknown type must stop here: falling through
    # would leave 'data' unbound and fail later with a confusing error.
    kind = filetype.lower()
    if kind == 'cdf':
        data = ANDI_reader(filename)
    elif kind == 'mzml':
        data = mzML_reader(filename)
    else:
        raise ValueError("file type not valid: %r" % (filetype,))

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # only the number of m/z channels is needed below
    _n_scan, n_mz = im.get_size()

    # smooth every ion chromatogram twice, then apply the tophat filter
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    # qualifier ions only need to reach half of the common-ion threshold
    half_threshold = threshold / 2

    for mp in sample.get_missing_peaks():

        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci =  %s" % (common_ion,))
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 =  %s" % (qual_ion_1,))
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 =  %s" % (qual_ion_2,))

        # Convert the time window into a number of scans: the index
        # distance between the peak RT and (RT - rt_window).
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        rt_window_points = points_1 - points_2
        print("rt_window =  %s" % (rt_window_points,))

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        # keep maxima that are strong in the common ion AND in both
        # qualifier ions at the same retention time
        large_peaks = []
        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > half_threshold and \
                        q2_intensity > half_threshold:
                    large_peaks.append([rt, intens])

        print('found %d peaks above threshold' % len(large_peaks))

        # integrate the common ion around each accepted maximum
        areas = []
        for peak_rt, _peak_intens in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak_rt)
            # a fresh list per call, as ion_area's treatment of its input
            # is not visible from here
            ia = ci_ion_chrom.get_intensity_array().tolist()
            area, _left, _right, _l_share, _r_share = ion_area(ia, apex, 0)
            areas.append(area)

        if areas:
            biggest_area = max(areas)
            mp.set_ci_area(biggest_area)
            # NOTE(review): this stores the average RT (divided by 60), not
            # the RT of the maximum that produced biggest_area -- confirm
            # that this is intended.
            mp.set_exact_rt("{:.3f}".format(float(mp_rt) / 60.0))
            print("found area: %s at rt: %s" % (biggest_area, mp_rt))
        else:
            print("Missing peak at rt =  %s" % (mp_rt,))
            mp.set_ci_area('na')
Esempio n. 2
0
def missing_peak_finder(sample, filename, points=13, null_ions=[73, 147],
                        crop_ions=[50,540], threshold=1000, rt_window=1,
                        filetype='cdf'):
    """
    @summary: Integrates raw data around missing peak locations
              to fill in NAs in the data matrix

    Each missing peak of the sample gets the largest ion_area() result
    found among the common-ion maxima near its retention time whose
    intensity exceeds 'threshold' and whose two qualifier ions both exceed
    threshold / 2.  When no maximum qualifies the area is set to 'na'.

    @param  sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample

    @param  filename: Name of the raw data file
    @type filename: stringType

    @param  points: Second argument handed to savitzky_golay() for both
                    smoothing passes (Default 13)
    @type points: intType

    @param  null_ions: Ions to be deleted in the matrix
    @type null_ions: listType

    @param crop_ions: Range of Ions to be considered; element 0 and
                      element 1 are passed to crop_mass()
    @type crop_ions: listType

    @param threshold: Minimum intensity of IonChromatogram allowable to fill
                      missing peak
    @type threshold: intType

    @param  rt_window: Window around average RT to look for missing peak,
                       in the time units of the chromatogram (presumably
                       seconds -- confirm against the reader)
    @type rt_window: floatType

    @param filetype: either 'cdf' or 'mzml' (compared case-insensitively)
    @type filetype: stringType

    @raise ValueError: if filetype is neither 'cdf' nor 'mzml'

    @author: Sean O'Callaghan
    """

    # NOTE: the list defaults above are only iterated / indexed, never
    # mutated, so the shared default objects are safe.

    print("Sample: %s File: %s" % (sample.get_name(), filename))

    # Dispatch on the file type.  Previously an unknown type only printed
    # a message and then failed on the unbound 'data' variable below.
    requested = filetype.lower()
    if requested == 'cdf':
        data = ANDI_reader(filename)
    elif requested == 'mzml':
        data = mzML_reader(filename)
    else:
        raise ValueError("file type not valid: %r" % (filetype,))

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # the scan count is not used; only the number of m/z channels is
    _n_scan, n_mz = im.get_size()

    # two Savitzky-Golay passes followed by the tophat filter, per channel
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    # qualifier ions are checked against half the common-ion threshold
    qual_threshold = threshold / 2

    for mp in sample.get_missing_peaks():

        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci =  %s" % (common_ion,))
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 =  %s" % (qual_ion_1,))
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 =  %s" % (qual_ion_2,))

        # Express rt_window as a scan count: index distance between the
        # peak RT and (RT - rt_window).
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        rt_window_points = points_1 - points_2
        print("rt_window =  %s" % (rt_window_points,))

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        # a maximum counts only if the common ion and both qualifier ions
        # are intense enough at that retention time
        large_peaks = []
        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > qual_threshold and \
                        q2_intensity > qual_threshold:
                    large_peaks.append([rt, intens])

        print('found %d peaks above threshold' % len(large_peaks))

        # integrate the common ion around each accepted maximum
        areas = []
        for peak_rt, _peak_intens in large_peaks:
            apex = ci_ion_chrom.get_index_at_time(peak_rt)
            # a fresh list per call, as ion_area's treatment of its input
            # is not visible from here
            ia = ci_ion_chrom.get_intensity_array().tolist()
            area, _left, _right, _l_share, _r_share = ion_area(ia, apex, 0)
            areas.append(area)

        if areas:
            biggest_area = max(areas)
            mp.set_ci_area(biggest_area)
            print("found area: %s at rt: %s" % (biggest_area, mp_rt))
        else:
            print("Missing peak at rt =  %s" % (mp_rt,))
            mp.set_ci_area('na')
Esempio n. 3
0
"""proc.py
"""

import sys
sys.path.append("/x/PyMS")

from pyms.GCMS.IO.MZML.Function import mzML_reader

# Read the raw data from the mzML file into a PyMS data object.
mzml_file = "/x/PyMS/data/TP1U-11-16_86-2207.mzML"
data = mzML_reader(mzml_file)

# Raw data operations: report the mass range found in the data.
# The print calls use a single pre-formatted argument so the script runs
# unchanged on Python 2 and Python 3 and emits the same text on both.
print("minimum mass found in all data:  %s" % (data.get_min_mass(),))
print("maximum mass found in all data:  %s" % (data.get_max_mass(),))

# Retention times of the scans.
time = data.get_time_list()
print("number of retention times:  %s" % (len(time),))
print("retention time of 1st scan:  %s sec" % (time[0],))
print("index of 400sec in time_list:  %s" % (data.get_index_at_time(400.0),))

# Total ion chromatogram (TIC).
tic = data.get_tic()
print("number of scans in TIC:  %s" % (len(tic),))
print("start time of TIC:  %s sec" % (tic.get_time_at_index(0),))

# Raw scans.
scans = data.get_scan_list()

print("number of masses in 1st scan:  %s" % (len(scans[0]),))