Beispiel #1
0
    def test_BillerBiemann(self, im_i):
        """
        Check ``BillerBiemann`` peak detection on a smoothed,
        baseline-corrected copy of the intensity matrix ``im_i``.
        """
        # Work on a private copy so the object passed in is left untouched
        matrix = copy.deepcopy(im_i)

        # ``size`` is (scans, masses); only the number of masses is needed
        _, ic_count = matrix.size

        # Noise filter, then baseline correct, each ion chromatogram in turn
        for idx in range(ic_count):
            smoothed = savitzky_golay(matrix.get_ic_at_index(idx))
            matrix.set_ic_at_index(idx, tophat(smoothed, struct="1.5m"))

        # Peak detection with the function's default arguments
        default_peaks = BillerBiemann(matrix)
        assert isinstance(default_peaks, list)
        assert len(default_peaks) == 2101
        assert all(isinstance(peak, Peak) for peak in default_peaks)

        # Find apexes over 9 points and combine with the neighbouring peak
        # if two scans apex next to each other.
        combined_peaks = BillerBiemann(matrix, points=9, scans=2)
        assert len(combined_peaks) == 805

        assert len(combined_peaks) <= len(default_peaks)
Beispiel #2
0
def test_smooth_im(data):
    """
    Smooth and baseline-correct every ion chromatogram of an intensity
    matrix built from ``data``, checking the type of each intermediate.
    """
    # Build the intensity matrix using the builder's default arguments
    matrix = build_intensity_matrix_i(data)

    # Attribute access only; the value itself is not checked
    matrix.min_mass
    _, ic_count = matrix.size

    for idx in range(ic_count):
        raw_ic = matrix.get_ic_at_index(idx)
        assert isinstance(raw_ic, IonChromatogram)

        smoothed_ic = savitzky_golay(raw_ic)
        assert isinstance(smoothed_ic, IonChromatogram)

        corrected_ic = tophat(smoothed_ic, struct="1.5m")
        assert isinstance(corrected_ic, IonChromatogram)

        # Store the processed IC and confirm the matrix returns an equal one
        matrix.set_ic_at_index(idx, corrected_ic)
        assert matrix.get_ic_at_index(idx) == corrected_ic
Beispiel #3
0
def expr_list(pyms_datadir):
    """
    Yield a list of ``Experiment`` objects, one per code in ``eley_codes``.

    Each experiment is built from ``<code>.JDX`` in ``pyms_datadir``, dumped
    to a temporary directory and then loaded back from disk. The temporary
    directory exists only while the generator is suspended at the ``yield``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        outputdir = pathlib.Path(tmpdir)

        # Create experiment files
        for code in eley_codes:
            matrix = build_intensity_matrix_i(
                JCAMP_reader(pyms_datadir / f"{code}.JDX"))

            # ``size`` is (scans, masses); only the number of masses is needed
            _, ic_count = matrix.size

            # Noise filter and baseline correct every ion chromatogram
            for idx in range(ic_count):
                smoothed = savitzky_golay(matrix.get_ic_at_index(idx))
                matrix.set_ic_at_index(idx, tophat(smoothed, struct="1.5m"))

            detected = BillerBiemann(matrix, points=9, scans=2)

            print('#')
            thresholded = rel_threshold(detected, 2)
            kept_peaks = num_ions_threshold(thresholded, 3, 3000)
            print('#')

            for peak in kept_peaks:
                # Ignore TMS ions and set the mass range
                peak.crop_mass(50, 400)
                peak.null_mass(73)
                peak.null_mass(147)

                # Record the total peak area and the per-ion areas
                peak.area = peak_sum_area(matrix, peak)
                peak.ion_areas = peak_top_ion_areas(matrix, peak)

            experiment = Experiment(code, kept_peaks)

            # Same time range for all experiments
            experiment.sele_rt_range(["6.5m", "21m"])

            print('#')
            experiment.dump(outputdir / f"{code}.expr")
            print('#')

        # Load the experiments back from the files just written
        loaded = []
        for code in eley_codes:
            experiment = load_expr(outputdir / f"{code}.expr")
            assert isinstance(experiment, Experiment)
            loaded.append(experiment)

        yield loaded
Beispiel #4
0
def _peak_list(im_i):
    """
    Return the peaks detected in a smoothed, baseline-corrected copy of ``im_i``.

    The matrix is deep-copied first, so the object passed in is not modified.
    """
    matrix = deepcopy(im_i)

    # ``size`` is (scans, masses); only the number of masses is needed
    _, ic_count = matrix.size

    # Noise filter and baseline correct every ion chromatogram
    for idx in range(ic_count):
        smoothed = savitzky_golay(matrix.get_ic_at_index(idx))
        matrix.set_ic_at_index(idx, tophat(smoothed, struct="1.5m"))

    # Biller and Biemann peak detection, called with points=9 and scans=2
    return BillerBiemann(matrix, points=9, scans=2)
Beispiel #5
0
def Preprocess_IntensityMatrixes(matrixes):
    """
    Noise removal and baseline correction of intensity matrices.

    Every ion chromatogram of every matrix is smoothed with
    ``savitzky_golay``, baseline-corrected with ``tophat`` (struct '1.5m')
    and written back into its matrix, so the matrices are modified in place.
    A progress line is printed for each ion chromatogram.

    @param matrixes: List of intensity matrices to process
    @return: The same list object that was passed in
    """

    # Number the matrices from 1. The previous hand-rolled counter was
    # incremented before its first use, so the first matrix was reported
    # as "Unit 2".
    for unit, im in enumerate(matrixes, start=1):

        _, n_mz = im.get_size()

        for ii in range(n_mz):

            print("Working on IC#", ii + 1, " Unit", unit)
            ic = im.get_ic_at_index(ii)
            ic_smoof = savitzky_golay(ic)
            ic_bc = tophat(ic_smoof, struct='1.5m')
            im.set_ic_at_index(ii, ic_bc)

    return matrixes
def Preprocess_IntensityMatrices(matrices):
    """
    Smooth and baseline-correct every ion chromatogram of each matrix.

    The matrices are modified in place: each ion chromatogram is replaced by
    its Savitzky-Golay smoothed, top-hat corrected version.

    @param matrices: List of matrices generated by the matrix_from_cdf method
    @return: The same list, ready for peak detection
    """

    for matrix in matrices:
        # get_size() unpacks into two values; only the second is needed
        _, ic_total = matrix.get_size()

        for position in range(ic_total):
            smoothed = savitzky_golay(matrix.get_ic_at_index(position))
            corrected = tophat(smoothed, struct='1.5m')
            matrix.set_ic_at_index(position, corrected)

    return matrices
Beispiel #7
0
def Preprocess_IntensityMatrixes(matrixes):
    '''
    Noise removal and baseline correction of intensity matrices.

    Each matrix is updated in place, one ion chromatogram at a time.

    @param matrixes: List of matrixes generated by the matrix_from_cdf method
    @return: The same list, holding the 'cleansed' matrixes
    '''

    for intensity_matrix in matrixes:
        # Two-element size; the second element is the number of ICs to process
        _, n_ic = intensity_matrix.get_size()

        for ic_index in range(n_ic):
            raw_ic = intensity_matrix.get_ic_at_index(ic_index)
            intensity_matrix.set_ic_at_index(
                ic_index,
                tophat(savitzky_golay(raw_ic), struct='1.5m'),
            )

    return matrixes
def test_savitzky_golay(tic):
    """
    Check that ``savitzky_golay`` returns a new, smoothed ``IonChromatogram``
    whose time axis matches the input, and that arguments of the wrong type
    raise ``TypeError``.
    """
    assert isinstance(tic, IonChromatogram)

    # apply noise smoothing
    tic1 = savitzky_golay(tic)
    assert isinstance(tic1, IonChromatogram)

    assert tic1 != tic
    assert tic1.is_tic()
    assert len(tic1) == 2103
    assert len(tic) == len(tic1)  # Length should be unchanged
    assert tic1.get_intensity_at_index(test_int) == 421885.76190476184

    # Time-related values are checked against a literal and against the
    # unsmoothed input.
    assert tic1.get_time_at_index(test_int) == 1304.15599823
    assert tic1.get_time_at_index(test_int) == tic.get_time_at_index(test_int)
    assert tic1.time_list[0] == 1.05200003833
    assert tic1.time_list[0] == tic.time_list[0]
    assert tic1.time_step == 1.0560000035830972
    # Compare against the original ``tic``; comparing ``tic1`` with itself
    # could never fail.
    assert tic1.time_step == tic.time_step
    assert tic1.get_index_at_time(12) == 10
    assert tic1.get_index_at_time(12) == tic.get_index_at_time(12)

    # Accessing ``mass`` on this chromatogram is expected to emit a warning
    with pytest.warns(Warning):
        tic1.mass

    # Test Errors
    for obj in [test_string, *test_numbers, *test_lists, test_dict]:
        with pytest.raises(TypeError):
            savitzky_golay(obj)  # type: ignore

    for obj in [test_string, test_float, *test_lists, test_dict]:
        with pytest.raises(TypeError):
            savitzky_golay(tic, degree=obj)  # type: ignore

    for obj in [test_float, *test_lists, test_dict]:
        with pytest.raises(TypeError):
            savitzky_golay(tic, window=obj)  # type: ignore
    def run(self):
        """
        Process the raw data file into an Experiment.

        Reads the file named by ``self.properties["Original Filename"]`` with
        the reader matching ``self.filetype``, bins it into an intensity
        matrix, crops and filters the matrix according to the Method named by
        ``self.properties["Method"]``, detects and filters peaks, and stores
        the results on the instance as ``self.expr``, ``self.tic`` and
        ``self.filtered_peak_list``. The "Date Created" and "Date Modified"
        properties are both set to the current time.

        Returns early, without processing anything, when ``self.filetype`` is
        not one of the three formats handled below.
        """
        print("Quantitative Processing in Progress...")

        # TODO: Include data etc. in experiment file

        # NOTE(review): update_pbar() presumably advances the progress bar by
        # one step; it is called after each stage below -- confirm.
        self.update_pbar()

        # Readers are imported lazily, so only the one that is needed is loaded
        if self.filetype == ID_Format_jcamp:
            # Load data using JCAMP_reader
            from pyms.GCMS.IO.JCAMP import JCAMP_reader
            data = JCAMP_reader(self.properties["Original Filename"])

        elif self.filetype == ID_Format_mzML:
            # Load data using MZML_reader
            from pyms.GCMS.IO.MZML import MZML_reader
            data = MZML_reader(self.properties["Original Filename"])

        elif self.filetype == ID_Format_ANDI:
            # Load data using ANDI_reader
            from pyms.GCMS.IO.ANDI import ANDI_reader
            data = ANDI_reader(self.properties["Original Filename"])

        else:
            # Unknown Format
            return
        # TODO: Waters RAW, Thermo RAW, Agilent .d

        self.update_pbar()

        method = Method.Method(self.properties["Method"])

        self.update_pbar()

        # list of all retention times, in seconds
        # (not used again within this method)
        times = data.get_time_list()
        # get Total Ion Chromatogram
        tic = data.get_tic()
        # RT Range, time step, no. scans, min, max, mean and median m/z
        data.info()

        # Build "intensity matrix" by binning data with integer bins and a
        # window of -0.3 to +0.7, the same as NIST uses
        im = build_intensity_matrix_i(data)

        self.update_pbar()

        # Show the m/z of the maximum and minimum bins
        print(" Minimum m/z bin: {}".format(im.get_min_mass()))
        print(" Maximum m/z bin: {}".format(im.get_max_mass()))

        # Crop masses; only the first two entries of mass_range are used
        min_mass, max_mass, *_ = method.mass_range

        # Clamp the requested range to the range actually present in the matrix
        if min_mass < im.get_min_mass():
            min_mass = im.get_min_mass()
        if max_mass > im.get_max_mass():
            max_mass = im.get_max_mass()
        im.crop_mass(min_mass, max_mass)

        self.update_pbar()

        # Perform Data filtering
        n_scan, n_mz = im.get_size()

        # Iterate over each IC in the intensity matrix
        for ii in range(n_mz):
            # print("\rWorking on IC#", ii+1, '  ',end='')
            ic = im.get_ic_at_index(ii)

            if method.enable_sav_gol:
                # Perform Savitzky-Golay smoothing.
                # Note that Turbomass does not use smoothing for qualitative method.
                ic = savitzky_golay(ic)

            if method.enable_tophat:
                # Perform Tophat baseline correction
                # Top-hat baseline Correction seems to bring down noise,
                # retaining shapes, but keeps points on actual peaks
                ic = tophat(ic, struct=method.tophat_struct)

            # Set the IC in the intensity matrix to the filtered one
            im.set_ic_at_index(ii, ic)

            self.update_pbar()

        # Peak Detection based on Biller and Biemann (1974), with a window
        # of <points>, and combining <scans> if they apex next to each other
        peak_list = BillerBiemann(im,
                                  points=method.bb_points,
                                  scans=method.bb_scans)

        self.update_pbar()

        print(" Number of peaks identified before filtering: {}".format(
            len(peak_list)))

        if method.enable_noise_filter:
            # Filtering peak lists with automatic noise filtering; the noise
            # level is estimated from the TIC taken before any filtering
            noise_level = window_analyzer(tic)
            # should we also do rel_threshold() here?
            # https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
            peak_list = num_ions_threshold(peak_list, method.noise_thresh,
                                           noise_level)

        self.update_pbar()

        filtered_peak_list = []

        for peak in peak_list:
            # Get mass and intensity lists for the mass spectrum at the apex of the peak
            apex_mass_list = peak.mass_spectrum.mass_list
            apex_mass_spec = peak.mass_spectrum.mass_spec

            # Determine the intensity of the base peak in the mass spectrum
            base_peak_intensity = max(apex_mass_spec)

            # Determine the index of the base peak in the mass spectrum
            # (the first index, if several masses share the maximum intensity)
            base_peak_index = [
                index for index, intensity in enumerate(apex_mass_spec)
                if intensity == base_peak_intensity
            ][0]

            # Finally, determine the mass of the base peak
            base_peak_mass = apex_mass_list[base_peak_index]

            # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
            if base_peak_mass in method.base_peak_filter:
                continue

            # Only peaks that survive the base peak filter get an area
            area = peak_sum_area(im, peak)
            peak.set_area(area)
            filtered_peak_list.append(peak)

            self.update_pbar()

        print(" Number of peaks identified: {}".format(
            len(filtered_peak_list)))

        # Create an experiment, restricted to the Method's target range
        # (the "m" suffix marks the values as minutes)
        self.expr = Experiment(self.sample_name, filtered_peak_list)
        self.expr.sele_rt_range([
            "{}m".format(method.target_range[0]),
            "{}m".format(method.target_range[1])
        ])

        self.update_pbar()

        current_time = time_now()

        # The date and time the experiment was created
        self.properties["Date Created"] = current_time

        # The date and time the experiment was last modified
        self.properties["Date Modified"] = current_time

        # Set the progress bar, if there is one, to its full range
        if self.pbar:
            self.pbar.Update(self.pbar.Range)

        # Keep the unfiltered TIC and the filtered peaks on the instance
        self.tic = tic
        self.filtered_peak_list = filtered_peak_list
Beispiel #10
0
def test_align_2_alignments(A1, pyms_datadir, tmp_pathplus):
    """
    Build an alignment from the ``geco_codes`` experiments, align it with the
    existing alignment ``A1`` and write the results into ``tmp_pathplus``.
    """
    experiments = []

    for code in geco_codes:
        matrix = build_intensity_matrix_i(
            JCAMP_reader(pyms_datadir / f"{code}.JDX"))

        # ``size`` is (scans, masses); only the number of masses is needed
        _, ic_count = matrix.size

        # Noise filter and baseline correct every ion chromatogram
        for idx in range(ic_count):
            smoothed = savitzky_golay(matrix.get_ic_at_index(idx))
            matrix.set_ic_at_index(idx, tophat(smoothed, struct="1.5m"))

        detected = BillerBiemann(matrix, points=9, scans=2)
        thresholded = rel_threshold(detected, 2)
        kept_peaks = num_ions_threshold(thresholded, 3, 3000)

        for peak in kept_peaks:
            # Ignore TMS ions and set the mass range
            peak.crop_mass(50, 400)
            peak.null_mass(73)
            peak.null_mass(147)

            # Record the total peak area and the per-ion areas
            peak.area = peak_sum_area(matrix, peak)
            peak.ion_areas = peak_top_ion_areas(matrix, peak)

        experiment = Experiment(code, kept_peaks)

        # Same time range for all experiments
        experiment.sele_rt_range(["6.5m", "21m"])

        experiments.append(experiment)

    # Alignment of the experiments built above, using the module-level
    # Dw and Gw parameters
    within_input = exprl2alignment(experiments)
    within_tree = PairwiseAlignment(within_input, Dw, Gw)
    A2 = align_with_tree(within_tree, min_peaks=2)

    # Between-replicates alignment parameters
    rt_modulation = 10.0
    gap_penalty = 0.30

    print("Aligning input {1,2}")
    between_tree = PairwiseAlignment([A1, A2], rt_modulation, gap_penalty)
    combined = align_with_tree(between_tree)

    combined.write_csv(tmp_pathplus / "rt.csv", tmp_pathplus / "area.csv")

    # Drop falsy entries before storing the aligned peaks
    aligned_peaks = [entry for entry in combined.aligned_peaks() if entry]
    store_peaks(aligned_peaks, tmp_pathplus / "peaks.bin")

    # Note: this writes to the same "area.csv" path as write_csv() above
    common_ions = combined.common_ion()
    combined.write_common_ion_csv(tmp_pathplus / "area.csv", common_ions)
Beispiel #11
0
"""proc.py
"""

import sys
sys.path.append("/x/PyMS/")

from pyms.GCMS.IO.ANDI.Function import ANDI_reader
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Baseline.TopHat import tophat

# Read the raw data from the ANDI file
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Get the total ion chromatogram (TIC)
tic = data.get_tic()

# Smooth the TIC with the Savitzky-Golay filter, then apply top-hat baseline
# correction to the smoothed result
tic1 = savitzky_golay(tic)
tic2 = tophat(tic1, struct="1.5m")

# Save the raw, the smoothed, and the smoothed + baseline-corrected TIC
tic.write("output/tic.dat", minutes=True)
tic1.write("output/tic_smooth.dat", minutes=True)
tic2.write("output/tic_smooth_bc.dat", minutes=True)
Beispiel #12
0
# Take the ion chromatogram at index 73 of the smoothed intensity matrix
ic_smooth1 = im_smooth1.get_ic_at_index(73)

# Write both ``ic`` and its smoothed counterpart to disk for comparison
ic.write(output_directory / "noise_smoothing_ic.dat", minutes=True)
ic_smooth1.write(output_directory / "noise_smoothing_ic_smooth1.dat",
                 minutes=True)

# ## Savitzky-Golay noise filter
#
# A more sophisticated noise filter is the Savitzky-Golay filter.
# Given the data loaded as above, this filter can be applied as
# follows:

# In[9]:

from pyms.Noise.SavitzkyGolay import savitzky_golay
tic4 = savitzky_golay(tic)

# Write the smoothed TIC to disk:

# In[10]:

tic4.write(output_directory / "noise_smoothing_tic4.dat", minutes=True)

# In this example the default parameters were used.
#
# ### Savitzky-Golay Noise filtering of Intensity Matrix Object
#
# The |savitzky_golay()| function described above acts on a single
# |IonChromatogram|. Where it is desired to perform Savitzky Golay
# filtering on the whole |IntensityMatrix| the function
# |savitzky_golay_im()| may be used as follows:
Beispiel #13
0
 # read in raw data
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Keep only the 4101-4350 part of the run
# NOTE(review): integer arguments to trim() are presumably scan numbers -- confirm
data.trim(4101, 4350)

# Build Intensity Matrix
real_im = build_intensity_matrix_i(data)

n_scan, n_mz = real_im.get_size()

# Perform the necessary pre-filtering: smooth each ion chromatogram, then
# baseline correct it, and store the result back in the matrix
for ii in range(n_mz):
    ic = real_im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    real_im.set_ic_at_index(ii, ic_bc)

# Detect Peaks
peak_list = BillerBiemann(real_im, points=3, scans=2)

# A single pre-formatted argument prints the same text whether ``print`` is
# the Python 2 statement or the Python 3 function (the two spaces after the
# colon reproduce the output of the original print statement).
print("Number of peaks found in real data:  %d" % len(peak_list))

######### Filter peaks###############
# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have less than a given number of ions above
# a given value
Beispiel #14
0
def missing_peak_finder(sample, andi_file, points=7, null_ions=[73, 207],
                        crop_ions=[45, 300], threshold=100000, rt_window=10):
    """
    @summary: Integrates raw data around missing peak locations
              to fill in NAs in the data matrix

    For every missing peak of the sample, the common ion's chromatogram is
    searched for maxima around the peak's retention time. Maxima above
    'threshold', whose two qualifying ions are both above half of
    'threshold' at the same time, are integrated, and the largest area is
    stored on the missing peak with set_ci_area(). If no maximum qualifies,
    the area is set to the string 'NA'. Nothing is returned.

    @param  sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample

    @param  andi_file: Name of the raw data file
    @type andi_file: stringType

    @param  points: Passed as the second positional argument to each of the
                    two savitzky_golay smoothing passes (Default 7)
    @type points: intType

    @param  null_ions: Ions to be deleted in the matrix (only read, never
                       modified)
    @type null_ions: listType

    @param crop_ions: Range of Ions to be considered; the first two items
                      are used as lower and upper bound (only read, never
                      modified)
    @type crop_ions: listType

    @param threshold: Minimum intensity of IonChromatogram allowable to fill
                      missing peak
    @type threshold: intType

    @param  rt_window: Window in seconds around average RT to look for
                       missing peak
    @type rt_window: floatType

    @author: Sean O'Callaghan
    """

    ### some error checks on null and crop ions

    ### a for root,files,dirs in os.path.walk(): loop

    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("Sample: %s andi_file: %s" % (sample.get_name(), andi_file))

    data = ANDI_reader(andi_file)

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # get the size of the intensity matrix
    n_scan, n_mz = im.get_size()

    # smooth data: two Savitzky-Golay passes followed by top-hat baseline
    # correction, for every ion chromatogram
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic1 = savitzky_golay(ic, points)
        ic_smooth = savitzky_golay(ic1, points)
        ic_base = tophat(ic_smooth, struct="1.5m")
        im.set_ic_at_index(ii, ic_base)

    for mp in sample.get_missing_peaks():
        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)

        ######
        # Integrate the CI around that particular RT
        #######

        # Convert the time window into a number of scans: the index distance
        # between the peak's RT and the RT one window earlier
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        rt_window_points = points_1 - points_2

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        # Retention times of the maxima that pass all three intensity checks
        large_peak_rts = []

        for rt, intens in maxima_list:
            if intens > threshold:
                q1_index = qi1_ion_chrom.get_index_at_time(rt)
                q2_index = qi2_ion_chrom.get_index_at_time(rt)

                q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
                q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

                if q1_intensity > threshold / 2 and q2_intensity > threshold / 2:
                    large_peak_rts.append(rt)

        areas = []
        if large_peak_rts:
            # The intensity list is the same for every candidate maximum, so
            # it is built once rather than once per candidate
            ia = ci_ion_chrom.get_intensity_array().tolist()
            for peak_rt in large_peak_rts:
                apex = ci_ion_chrom.get_index_at_time(peak_rt)
                # Only the first item returned by ion_area (the area) is used
                areas.append(ion_area(ia, apex, 0)[0])

        ########################
        if areas:
            mp.set_ci_area(max(areas))
        else:
            # No qualifying maximum was found for this missing peak
            mp.set_ci_area('NA')
def import_processing(
        jcamp_file,
        spectrum_csv_file,
        report_csv_file,
        combined_csv_file,
        bb_points=9,
        bb_scans=2,
        noise_thresh=2,
        target_range=(0, 120),
        tophat_struct="1.5m",
        nistpath="../MSSEARCH",
        base_peak_filter=['73'],
        ExprDir=".",
        ):
    """
    Process a JCAMP file into CSV reports, a peak list and an Experiment.

    The data is binned into an intensity matrix, every scan is written to
    ``spectrum_csv_file``, the matrix is smoothed and baseline corrected, and
    peaks are detected and filtered. The TIC, the peak list and the
    Experiment are stored in ``ExprDir``. The peaks with the largest areas
    (at most 80) are written, together with five library hits each, to
    ``report_csv_file`` and ``combined_csv_file``.

    :param jcamp_file: Path of the JCAMP file to read. Its base name without
        extension is used as the sample name.
    :param spectrum_csv_file: Path of the ';'-separated file that receives the
        binned mass spectrum of every scan.
    :param report_csv_file: Path of the ';'-separated peak report
        (header ``#;RT;Scan;Height;Area``).
    :param combined_csv_file: Path of the second ';'-separated report, which
        lists each peak followed by one line per library hit.
    :param bb_points: Passed to ``BillerBiemann`` as ``points``.
    :param bb_scans: Passed to ``BillerBiemann`` as ``scans``.
    :param noise_thresh: Passed to ``num_ions_threshold`` as the required
        number of ions.
    :param target_range: Lower and upper retention time in minutes. Peaks are
        reported only if ``lower < RT <= upper``; the same range is applied to
        the Experiment.
    :param tophat_struct: Passed to ``tophat`` as ``struct``.
    :param nistpath: Stored in the module-level global ``nist_path``.
    :param base_peak_filter: Masses (converted with ``int``) whose presence as
        base peak causes a peak to be skipped. Only read, never modified.
    :param ExprDir: Directory that receives ``<sample>_tic.dat``,
        ``<sample>_peaks.dat`` and ``<sample>.expr``.

    :return: Always 0.
    """
    global nist_path
    nist_path = nistpath

    # Parameters
    base_peak_filter = [int(x) for x in base_peak_filter]
    target_range = tuple(target_range)
    sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]
    number_of_peaks = 80

    data = JCAMP_reader(jcamp_file)

    # list of all retention times, in seconds
    times = data.get_time_list()
    # get Total Ion Chromatogram
    tic = data.get_tic()
    # RT Range, time step, no. scans, min, max, mean and median m/z
    data.info()

    # Mass Binning: convert to intensity matrix
    im = build_intensity_matrix_i(data)

    print(" Minimum m/z bin: {}".format(im.get_min_mass()))
    print(" Maximum m/z bin: {}".format(im.get_max_mass()))

    # Write Binned Mass Spectra to OpenChrom-like CSV file.
    # The first mass spectrum supplies the mass bins for the header row.
    ms = im.get_ms_at_index(0)
    # ``with`` closes the file even if writing a scan fails part-way
    with open(spectrum_csv_file, 'w') as spectrum_csv:
        spectrum_csv.write('RT(milliseconds);RT(minutes) - NOT USED BY IMPORT;RI;')
        spectrum_csv.write(';'.join(str(mz) for mz in ms.mass_list))
        spectrum_csv.write("\n")

        for scan, scan_time in enumerate(times):
            spectrum_csv.write("{};{};{};".format(
                int(scan_time * 1000),
                rounders((scan_time / 60), "0.0000000000"),
                0,
            ))
            ms = im.get_ms_at_index(scan)
            spectrum_csv.write(';'.join(str(intensity) for intensity in ms.mass_spec))
            spectrum_csv.write('\n')

    ## Data filtering

    # Note that Turbomass does not use smoothing for qualitative method.
    # Top-hat baseline Correction seems to bring down noise,
    #  retaining shapes, but keeps points on actual peaks

    n_scan, n_mz = im.get_size()
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic_smooth = savitzky_golay(ic)
        ic_bc = tophat(ic_smooth, struct=tophat_struct)
        im.set_ic_at_index(ii, ic_bc)

    # Peak Detection based on Biller and Biemann, 1974, with a window
    #  of n points, and combining y scans if they apex next to each other
    peak_list = BillerBiemann(im, points=bb_points, scans=bb_scans)

    print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

    # Filtering peak lists with automatic noise filtering; the noise level is
    # estimated from the TIC taken before any filtering
    noise_level = window_analyzer(tic)
    peak_list = num_ions_threshold(peak_list, noise_thresh, noise_level)
    # why use 2 for number of ions above threshold?
    print(" Number of peaks identified: {}".format(len(peak_list)))

    # Peak Areas
    peak_area_list = []
    filtered_peak_list = []

    for peak in peak_list:
        apex_spectrum = peak.get_mass_spectrum()
        apex_mass_list = apex_spectrum.mass_list
        apex_mass_spec = apex_spectrum.mass_spec
        base_peak_intensity = max(apex_mass_spec)
        # First index at which the maximum intensity occurs
        base_peak_index = next(
            index for index, intensity in enumerate(apex_mass_spec)
            if intensity == base_peak_intensity
        )
        base_peak_mass = apex_mass_list[base_peak_index]
        if base_peak_mass in base_peak_filter:
            continue  # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed

        area = peak_sum_area(im, peak)
        peak.set_area(area)
        peak_area_list.append(area)
        filtered_peak_list.append(peak)

    # Save the TIC and Peak List
    tic.write(os.path.join(ExprDir, "{}_tic.dat".format(sample_name)), formatting=False)
    store_peaks(filtered_peak_list, os.path.join(ExprDir, "{}_peaks.dat".format(sample_name)))

    # Indices into filtered_peak_list, ordered by ascending peak area
    # from https://stackoverflow.com/questions/16878715/how-to-find-the-index-of-n-largest-elements-in-a-list-or-np-array-python?lq=1
    top_peaks = sorted(range(len(peak_area_list)), key=lambda x: peak_area_list[x])

    # report_csv: turbomass-like CSV file
    # combine_csv: GunShotMatch Combine-like CSV file
    # ``with`` closes both files even if a library search raises part-way
    with open(report_csv_file, "w") as report_csv, \
            open(combined_csv_file, "w") as combine_csv:

        combine_csv.write(sample_name)
        combine_csv.write("\n")

        report_csv.write("#;RT;Scan;Height;Area\n")
        combine_csv.write("Retention Time;Peak Area;;Lib;Match;R Match;Name;CAS Number;Scan\n")

        report_buffer = []

        # peak_number is the 1-based position of the peak in the
        # ascending-area ordering
        for peak_number, index in enumerate(top_peaks, start=1):
            peak = filtered_peak_list[index]

            # Retention time (minutes, 3dp)
            RT = rounders(peak.get_rt() / 60, "0.000")

            if not target_range[0] < RT <= target_range[1]:
                continue  # skip the peak if it is outside the desired range

            # scan number, not that we really need it as the peak object has the spectrum
            scan_index = data.get_index_at_time(peak.get_rt())
            Scan = scan_index + 1
            # TIC intensity, as proxy for Peak height, which should be from baseline
            Height = '{:,}'.format(rounders(tic.get_intensity_at_index(scan_index), "0"))
            # Peak area, originally in "intensity seconds", so dividing by 60 to
            #  get "intensity minutes" like turbomass uses
            Area = '{:,}'.format(rounders(peak.get_area() / 60, "0.0"))

            report_buffer.append([peak_number, RT, Scan, Height, Area])

        # Largest areas first, keep at most number_of_peaks rows, then order
        # the survivors by scan number
        filtered_report_buffer = report_buffer[::-1][:number_of_peaks]
        filtered_report_buffer.sort(key=operator.itemgetter(2))

        for index, row in enumerate(filtered_report_buffer):
            report_csv.write(";".join([str(i) for i in row]))

            # row[2] is the 1-based scan number
            ms = im.get_ms_at_index(row[2] - 1)

            spectrum_name = "{}_{}".format(sample_name, row[1])
            create_msp(spectrum_name, ms.mass_list, ms.mass_spec)
            matches_dict = nist_ms_comparison(spectrum_name, ms.mass_list, ms.mass_spec)

            combine_csv.write("{};{};Page {} of 80;;;;;;{}\n".format(row[1], row[4], index + 1, row[2]))

            for hit in range(1, 6):
                hit_data = matches_dict["Hit{}".format(hit)]
                report_csv.write(str(hit_data))
                report_csv.write(";")
                combine_csv.write(";;{};{};{};{};{};{};\n".format(
                    hit,
                    hit_data["Lib"],
                    hit_data["MF"],
                    hit_data["RMF"],
                    hit_data["Name"],
                    hit_data["CAS"],
                ))

            report_csv.write("\n")

            # NOTE(review): presumably a pause between library searches;
            # confirm why two seconds are needed
            time.sleep(2)

    # Create an experiment
    expr = Experiment(sample_name, filtered_peak_list)
    expr.sele_rt_range(["{}m".format(target_range[0]), "{}m".format(target_range[1])])
    store_expr(os.path.join(ExprDir, "{}.expr".format(sample_name)), expr)

    return 0
Beispiel #16
0
# Read the raw data from the ANDI file
andi_file = data_directory / "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Keep only the 4101-4350 part of the run
# NOTE(review): integer arguments to trim() are presumably scan numbers -- confirm
data.trim(4101, 4350)

# Build Intensity Matrix
real_im = build_intensity_matrix_i(data)

n_scan, n_mz = real_im.size

# Perform the necessary pre-filtering: smooth each ion chromatogram, then
# baseline correct it, and store the result back in the matrix
for ii in range(n_mz):
    ic = real_im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    real_im.set_ic_at_index(ii, ic_bc)

# Detect Peaks
peak_list = BillerBiemann(real_im, points=3, scans=2)

print("Number of peaks found in real data: ", len(peak_list))

######### Filter peaks###############
# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have less than a given number of ions above
# a given value
Beispiel #17
0
    rel_threshold, num_ions_threshold

# Read the raw data as a GCMS_data object
andi_file = "data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Bin the data into an intensity matrix
im = build_intensity_matrix_i(data)

n_scan, n_mz = im.size

print("Intensity matrix size (scans, masses):", (n_scan, n_mz))

# Noise filter and baseline correct every ion chromatogram, storing the
# result back in the matrix
for ii in range(n_mz):
    ic = im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_bc = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_bc)

# Use Biller and Biemann technique to find apexing ions at a scan.
# Find apexes over 9 points and combine with the neighbouring peak if two
# scans apex next to each other.
peak_list = BillerBiemann(im, points=9, scans=2)

print("Number of peaks found: ", len(peak_list))

# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have less than a given number of ions above
# a given value
Beispiel #18
0
def missing_peak_finder(sample, filename, points=13, null_ions=None,
                        crop_ions=None, threshold=1000, rt_window=1,
                        filetype='cdf'):
    """
    @summary: Integrates raw data around missing peak locations
              to fill in NAs in the data matrix

    Nothing is returned. For every missing peak of C{sample} the common ion
    area is stored with C{set_ci_area()}: the largest integrated area among
    the qualifying maxima, or the string C{'na'} when none qualifies.

    @param  sample: The sample object containing missing peaks
    @type sample: pyms.MissingPeak.Class.Sample

    @param  filename: Name of the raw data file
    @type filename: stringType

    @param  points: Passed as the second argument to both savitzky_golay
                    smoothing passes (Default 13)
    @type points: intType

    @param  null_ions: Ions to be deleted in the matrix
                       (Default [73, 147] when None)
    @type null_ions: listType

    @param crop_ions: Range of Ions to be considered, as [low, high]
                      (Default [50, 540] when None)
    @type crop_ions: listType

    @param threshold: Minimum intensity of IonChromatogram allowable to fill
                      missing peak; both qualifier ions must exceed half of
                      this value
    @type threshold: intType

    @param  rt_window: Window around the missing peak RT to look for the
                       missing peak. It is subtracted directly from the
                       peak's RT, so it must use the same time unit
                       (documented as seconds)
    @type rt_window: floatType

    @param filetype: Format of the raw data file: 'cdf' is read with
                     ANDI_reader, 'mzml' with mzML_reader
    @type filetype: stringType

    @raise ValueError: if filetype is neither 'cdf' nor 'mzml'

    @author: Sean O'Callaghan
    """

    # None sentinels replace the former shared mutable list defaults
    if null_ions is None:
        null_ions = [73, 147]
    if crop_ions is None:
        crop_ions = [50, 540]

    # Single-argument print(...) calls with %-formatting produce the same
    # output under Python 2 and Python 3. Double spaces inside the format
    # strings reproduce the separator the old print statements emitted.
    print("Sample: %s File: %s" % (sample.get_name(), filename))

    if filetype == 'cdf':
        data = ANDI_reader(filename)
    elif filetype == 'mzml':
        data = mzML_reader(filename)
    else:
        # Previously only a message was printed and the function then failed
        # on the unbound name `data`; fail immediately with a clear error.
        raise ValueError("file type not valid: %r" % (filetype,))

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # only the number of ion chromatograms is needed
    _, n_mz = im.get_size()

    # smooth (two Savitzky-Golay passes) and baseline-correct every IC
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic_smooth = savitzky_golay(savitzky_golay(ic, points), points)
        im.set_ic_at_index(ii, tophat(ic_smooth, struct="1.5m"))

    # True division, so an odd integer threshold is not truncated on Python 2
    qual_threshold = threshold / 2.0

    for mp in sample.get_missing_peaks():

        mp_rt = mp.get_rt()
        common_ion = mp.get_ci()
        qual_ion_1 = float(mp.get_qual_ion1())
        qual_ion_2 = float(mp.get_qual_ion2())

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci =  %s" % (common_ion,))
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 =  %s" % (qual_ion_1,))
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 =  %s" % (qual_ion_2,))

        # Convert the RT window into a number of scan points
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        rt_window_points = points_1 - points_2
        print("rt_window =  %s" % (rt_window_points,))

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        # Keep maxima whose common ion exceeds the threshold and whose two
        # qualifier ions both exceed half of it at the same retention time
        large_peaks = []
        for rt, intens in maxima_list:
            if intens <= threshold:
                continue

            q1_index = qi1_ion_chrom.get_index_at_time(rt)
            q2_index = qi2_ion_chrom.get_index_at_time(rt)

            q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
            q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

            if q1_intensity > qual_threshold and q2_intensity > qual_threshold:
                large_peaks.append([rt, intens])

        print('found %d peaks above threshold' % len(large_peaks))

        if not large_peaks:
            print("Missing peak at rt =  %s" % (mp_rt,))
            mp.set_ci_area('na')
            continue

        # The intensity list is the same for every candidate: build it once
        ia = ci_ion_chrom.get_intensity_array().tolist()
        areas = [
            # ion_area returns a tuple whose first item is the area
            ion_area(ia, ci_ion_chrom.get_index_at_time(peak_rt), 0)[0]
            for peak_rt, _ in large_peaks
        ]

        biggest_area = max(areas)
        mp.set_ci_area(biggest_area)
        print("found area: %s at rt: %s" % (biggest_area, mp_rt))
Beispiel #19
0
def missing_peak_finder(
    sample: Sample,
    file_name: str,
    points: int = 3,
    null_ions: Optional[List] = None,
    crop_ions: Optional[List] = None,
    threshold: int = 1000,
    rt_window: float = 1,
    filetype: MissingPeakFiletype = MZML,
) -> None:
    r"""
    Integrates raw data around missing peak locations to fill ``NA``\s in the data matrix.

    Nothing is returned. Each missing peak of ``sample`` is updated in place:
    ``common_ion_area`` is set to the largest integrated area among the
    qualifying maxima (and ``exact_rt`` to the peak's RT divided by 60), or
    ``common_ion_area`` is set to ``None`` when no maximum qualifies.

    :param sample: The sample object containing missing peaks
    :param file_name: Name of the raw data file
    :param points: Passed as the second argument to both ``savitzky_golay``
        smoothing passes.
    :param null_ions: Ions to be deleted in the matrix.
    :default null_ions: ``[73, 147]``
    :param crop_ions: Range of Ions to be considered, as ``[low, high]``.
    :default crop_ions: ``[50, 540]``
    :param threshold: Minimum intensity of IonChromatogram allowable to fill.
        Both qualifier ions must exceed half of this value.
    :param rt_window: Window in seconds around average RT to look for.
    :param filetype: Format of the raw data file: ``NETCDF`` is read with
        ``ANDI_reader`` and ``MZML`` with ``mzML_reader``.

    :raises ValueError: If ``filetype`` is neither ``NETCDF`` nor ``MZML``.

    :author: Sean O'Callaghan
    """

    # Falsy values (None or an empty list) select the documented defaults
    if not null_ions:
        null_ions = [73, 147]
    if not crop_ions:
        crop_ions = [50, 540]

    # TODO: some error checks on null and crop ions

    # TODO: a for root,files,dirs in os.path.walk(): loop
    print("Sample:", sample.name, "File:", file_name)

    if filetype == NETCDF:
        # this package
        from pyms.GCMS.IO.ANDI import ANDI_reader
        data = ANDI_reader(file_name)

    elif filetype == MZML:
        # this package
        from pyms.GCMS.IO.MZML import mzML_reader
        data = mzML_reader(file_name)

    else:
        # Previously only a message was printed and the function then failed
        # with UnboundLocalError on `data`; fail immediately with a clear error.
        raise ValueError(f"file type not valid: {filetype!r}")

    # build integer intensity matrix
    im = build_intensity_matrix_i(data)

    for null_ion in null_ions:
        im.null_mass(null_ion)

    im.crop_mass(crop_ions[0], crop_ions[1])

    # only the number of ion chromatograms is needed
    _, n_mz = im.size

    # smooth (two Savitzky-Golay passes) and baseline-correct every IC
    for ii in range(n_mz):
        ic = im.get_ic_at_index(ii)
        ic_smooth = savitzky_golay(savitzky_golay(ic, points), points)
        im.set_ic_at_index(ii, tophat(ic_smooth, struct="1.5m"))

    qual_threshold = threshold / 2

    for mp in sample.missing_peaks:

        mp_rt = mp.rt
        common_ion = mp.common_ion
        qual_ion_1 = float(mp.qual_ion1)
        qual_ion_2 = float(mp.qual_ion2)

        ci_ion_chrom = im.get_ic_at_mass(common_ion)
        print("ci = ", common_ion)
        qi1_ion_chrom = im.get_ic_at_mass(qual_ion_1)
        print("qi1 = ", qual_ion_1)
        qi2_ion_chrom = im.get_ic_at_mass(qual_ion_2)
        print("qi2 = ", qual_ion_2)

        # Convert the RT window into a number of scan points
        points_1 = ci_ion_chrom.get_index_at_time(float(mp_rt))
        points_2 = ci_ion_chrom.get_index_at_time(float(mp_rt) - rt_window)
        rt_window_points = points_1 - points_2
        print("rt_window = ", rt_window_points)

        maxima_list = get_maxima_list_reduced(ci_ion_chrom, mp_rt,
                                              rt_window_points)

        # Keep maxima whose common ion exceeds the threshold and whose two
        # qualifier ions both exceed half of it at the same retention time
        large_peaks = []
        for rt, intens in maxima_list:
            if intens <= threshold:
                continue

            q1_index = qi1_ion_chrom.get_index_at_time(rt)
            q2_index = qi2_ion_chrom.get_index_at_time(rt)

            q1_intensity = qi1_ion_chrom.get_intensity_at_index(q1_index)
            q2_intensity = qi2_ion_chrom.get_intensity_at_index(q2_index)

            if q1_intensity > qual_threshold and q2_intensity > qual_threshold:
                large_peaks.append([rt, intens])

        print(f"found {len(large_peaks):d} peaks above threshold")

        if not large_peaks:
            print("Missing peak at rt = ", mp_rt)
            mp.common_ion_area = None
            continue

        # The intensity list is the same for every candidate: build it once
        ia = ci_ion_chrom.intensity_array.tolist()
        areas = [
            # ion_area returns a tuple whose first item is the area
            ion_area(ia, ci_ion_chrom.get_index_at_time(peak_rt), 0)[0]
            for peak_rt, _ in large_peaks
        ]

        biggest_area = max(areas)
        mp.common_ion_area = biggest_area
        mp.exact_rt = float(mp_rt) / 60.0
        print("found area:", biggest_area, "at rt:", mp_rt)
Beispiel #20
0
"""proc.py
"""

import sys
sys.path.append("/x/PyMS/")

from pyms.GCMS.IO.ANDI.Function import ANDI_reader
from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.Baseline.TopHat import tophat

# Read the raw data from the .cdf file with ANDI_reader
andi_file = "/x/PyMS/data/gc01_0812_066.cdf"
data = ANDI_reader(andi_file)

# Get the total ion chromatogram (TIC)
tic = data.get_tic()

# Apply noise smoothing (Savitzky-Golay), then top-hat baseline correction
# to the smoothed TIC
tic1 = savitzky_golay(tic)
tic2 = tophat(tic1, struct="1.5m")

# Save all three stages: the raw TIC, the smoothed TIC, and the
# smoothed + baseline-corrected TIC
tic.write("output/tic.dat",minutes=True)
tic1.write("output/tic_smooth.dat",minutes=True)
tic2.write("output/tic_smooth_bc.dat",minutes=True)

Beispiel #21
0
# Read the raw data and build the intensity matrix from it
andi_file = data_directory / "a0806_077.cdf"
data = ANDI_reader(andi_file)
im = build_intensity_matrix_i(data)

# Preprocess the data (Savitzky-Golay smoothing and Tophat baseline correction)

# In[3]:

from pyms.Noise.SavitzkyGolay import savitzky_golay
from pyms.TopHat import tophat

# Matrix dimensions; only n_mz (number of ion chromatograms) is used below
n_scan, n_mz = im.size

# Filter every ion chromatogram in place: two smoothing passes, then
# baseline correction
for ii in range(n_mz):
    ic = im.get_ic_at_index(ii)
    ic1 = savitzky_golay(ic)
    ic_smooth = savitzky_golay(ic1)  # Why the second pass here?
    ic_bc = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_bc)

# Now the Biller and Biemann based technique can be applied to detect peaks.

# In[4]:

from pyms.BillerBiemann import BillerBiemann

pl = BillerBiemann(im, points=9, scans=2)
# Bare expression: its value is only shown when run as a notebook cell
# (see the "In[...]" markers); as a plain script it is discarded
len(pl)

# Trim the peak list by relative intensity
# Read the raw data as a GCMS_data object
data = ANDI_reader(andi_file)
#data.trim(2431, 2469)

# IntensityMatrix
# default, float masses with interval (bin interval) of one from min mass
# A single parenthesised string prints identically under Python 2 and
# Python 3; the former print statement was a SyntaxError on Python 3.
print("default intensity matrix, bin interval = 1, boundary +/- 0.5")
im = build_intensity_matrix(data)

# Remove the m/z 73 and 147 channels from the matrix
im.null_mass(73)
im.null_mass(147)

n_scan, n_mz = im.get_size()

# Smooth and baseline-correct every ion chromatogram in place
for ii in range(n_mz):
    ic = im.get_ic_at_index(ii)
    ic_smooth = savitzky_golay(ic)
    ic_base = tophat(ic_smooth, struct="1.5m")
    im.set_ic_at_index(ii, ic_base)

# Load the experiment
exper = load_expr(expr_file)

# Load the peak list
peak_list = exper.get_peak_list()

# Pass Ion Chromatograms into a list of ICs
# (note: `ic` is rebound here from a single IC to the list of all ICs)
n_mz = len(im.get_mass_list())
ic = []

for m in range(n_mz):
    ic.append(im.get_ic_at_index(m))
Beispiel #23
0
	def quantitative_processing(self, jcamp_file, log_stdout=True):
		"""
		Process one JCAMP-DX file into a filtered peak list and a stored experiment.

		Visible steps: load the file with ``JCAMP_reader``, build an
		integer-binned intensity matrix, crop it to the configured mass range,
		smooth and baseline-correct every ion chromatogram, detect peaks with
		``BillerBiemann``, filter them by ion count above the noise level and
		by base peak mass, compute peak areas, then write the TIC, the peak
		list and the experiment into ``self.config.expr_dir``.

		:param jcamp_file: Path of the JCAMP-DX file to process. The sample
			name is its base name without the extension.
		:type jcamp_file: presumably str or path-like -- TODO confirm
		:param log_stdout: If true, ``sys.stdout`` is replaced by a file
			``<sample_name>.log`` in ``self.config.log_dir``, opened for writing.
			NOTE(review): the visible code neither restores ``sys.stdout`` nor
			closes that file -- confirm this is handled elsewhere.
		:type log_stdout: bool
		
		:return: NOTE(review): no return statement appears in the visible part
			of this method -- confirm the return value.
		:rtype:
		"""
		
		# Determine the name of the sample from the filename
		sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]
		
		# Log Stdout to File
		# (all print() output below then goes to the log file)
		if log_stdout:
			sys.stdout = open(os.path.join(self.config.log_dir, sample_name + ".log"), "w")
		
		# Load data using JCAMP_reader
		data = JCAMP_reader(jcamp_file)
		
		# list of all retention times, in seconds
		# (not used again in the visible part of this method)
		times = data.get_time_list()
		# get Total Ion Chromatogram
		tic = data.get_tic()
		# RT Range, time step, no. scans, min, max, mean and median m/z
		data.info()
		
		# Build "intensity matrix" by binning data with integer bins and a
		# 	window of -0.3 to +0.7, the same as NIST uses
		im = build_intensity_matrix_i(data)
		
		# Show the m/z of the maximum and minimum bins
		print(" Minimum m/z bin: {}".format(im.get_min_mass()))
		print(" Maximum m/z bin: {}".format(im.get_max_mass()))
		
		# Crop masses
		# Only the first two items of the configured range are used
		min_mass, max_mass, *_ = self.config.mass_range
		
		# Clamp the requested range to the masses present in the matrix
		if min_mass < im.get_min_mass():
			min_mass = im.get_min_mass()
		if max_mass > im.get_max_mass():
			max_mass = im.get_max_mass()
		im.crop_mass(min_mass, max_mass)
		
		# Perform Data filtering
		n_scan, n_mz = im.get_size()
		
		# Iterate over each IC in the intensity matrix
		for ii in range(n_mz):
			# print("\rWorking on IC#", ii+1, '  ',end='')
			ic = im.get_ic_at_index(ii)
			
			# Perform Savitzky-Golay smoothing.
			# Note that Turbomass does not use smoothing for qualitative method.
			ic_smooth = savitzky_golay(ic)
			
			# Perform Tophat baseline correction
			# Top-hat baseline Correction seems to bring down noise,
			#  		retaining shapes, but keeps points on actual peaks
			ic_bc = tophat(ic_smooth, struct=self.config.tophat_struct)
			
			# Set the IC in the intensity matrix to the filtered one
			im.set_ic_at_index(ii, ic_bc)
		
		# Peak Detection based on Biller and Biemann (1974), with a window
		# 	of <points>, and combining <scans> if they apex next to each other
		peak_list = BillerBiemann(im, points=self.config.bb_points, scans=self.config.bb_scans)
		
		print(" Number of peaks identified before filtering: {}".format(len(peak_list)))
		
		# Filtering peak lists with automatic noise filtering
		# The noise level is estimated from the unfiltered TIC
		noise_level = window_analyzer(tic)
		# should we also do rel_threshold() here?
		# https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
		peak_list = num_ions_threshold(peak_list, self.config.noise_thresh, noise_level)
		
		# Peaks that survive the base peak filter, with their areas set
		filtered_peak_list = []
		
		for peak in peak_list:
			# Get mass and intensity lists for the mass spectrum at the apex of the peak
			apex_mass_list = peak.mass_spectrum.mass_list
			apex_mass_spec = peak.mass_spectrum.mass_spec
			
			# Determine the intensity of the base peak in the mass spectrum
			base_peak_intensity = max(apex_mass_spec)
			
			# Determine the index of the base peak in the mass spectrum
			# (the first index holding the maximum intensity)
			base_peak_index = [
				index for index, intensity in enumerate(apex_mass_spec)
				if intensity == base_peak_intensity][0]
			
			# Finally, determine the mass of the base peak
			base_peak_mass = apex_mass_list[base_peak_index]
			
			# skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
			if base_peak_mass in self.config.base_peak_filter:
				continue
			
			# Compute the peak area from the filtered intensity matrix and
			# store it on the peak
			area = peak_sum_area(im, peak)
			peak.set_area(area)
			filtered_peak_list.append(peak)
			
		print(" Number of peaks identified: {}".format(len(filtered_peak_list)))
		
		# Save the TIC and Peak List
		tic.write(os.path.join(self.config.expr_dir, "{}_tic.dat".format(sample_name)), formatting=False)
		store_peaks(filtered_peak_list, os.path.join(self.config.expr_dir, "{}_peaks.dat".format(sample_name)))
		
		# Create an experiment
		# The RT range limits are passed as strings with an "m" suffix
		expr = Experiment(sample_name, filtered_peak_list)
		expr.sele_rt_range(["{}m".format(self.config.target_range[0]), "{}m".format(self.config.target_range[1])])
		store_expr(os.path.join(self.config.expr_dir, "{}.expr".format(sample_name)), expr)