Ejemplo n.º 1
0
# In[4]:

print(data.max_mass)

# A list of the first 10 retention times can be returned with:

# In[5]:

print(data.time_list[:10])

# The index of a specific retention time (in seconds) can be returned with:
#

# In[6]:

data.get_index_at_time(400.0)

# Note that this returns the index of the retention time in the
# data closest to the given retention time of 400.0 seconds.
#
# The |GCMS_data.tic| attribute
# returns a total ion chromatogram (TIC) of the data
# as an |IonChromatogram| object:

# In[7]:

print(data.tic)

# The |IonChromatogram| object is explained in a later example.
#
# ### A Scan Object
Ejemplo n.º 2
0
# read the raw data
jcamp_file = data_directory / "gc01_0812_066.jdx"
data = JCAMP_reader(jcamp_file)
print(data)

# raw data operations
print("minimum mass found in all data: ", data.min_mass)
print("maximum mass found in all data: ", data.max_mass)

# time
time = data.time_list
print(time)
print("number of retention times: ", len(time))
print("retention time of 1st scan: ", time[0], "sec")
print("index of 400sec in time_list: ", data.get_index_at_time(400.0))

# TIC
tic = data.tic
print(tic)
print("number of scans in TIC: ", len(tic))
print("start time of TIC: ", tic.get_time_at_index(0), "sec")

# raw scans
scans = data.scan_list
print(scans)
print(scans[0].mass_list)
print("1st mass value for 1st scan: ", scans[0].mass_list[0])
print("1st intensity value for 1st scan: ", scans[0].intensity_list[0])

print("minimum mass found in 1st scan: ", scans[0].min_mass)
Ejemplo n.º 3
0
def import_processing(jcamp_file, spectrum_csv_file, report_csv_file, combined_csv_file, bb_points = 9, bb_scans = 2, noise_thresh = 2, target_range = (0,120), tophat_struct="1.5m", nistpath = "../MSSEARCH", base_peak_filter = ['73'], ExprDir = "."):		
	global nist_path
	nist_path = nistpath
	
	# Parameters
	base_peak_filter = [int(x) for x in base_peak_filter]
	target_range = tuple(target_range)
	sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]
	number_of_peaks = 80
	
	data = JCAMP_reader(jcamp_file)
	
	# list of all retention times, in seconds
	times = data.get_time_list()
	# get Total Ion Chromatogram
	tic = data.get_tic() 
	# RT Range, time step, no. scans, min, max, mean and median m/z
	data.info()
	
	#data.write("output/data") # save output
	
	# Mass Binning	
	im = build_intensity_matrix_i(data) # covnert to intensity matrix
	#im.get_size() #number of scans, number of bins
	masses = im.get_mass_list() # list of mass bins
	
	print(" Minimum m/z bin: {}".format(im.get_min_mass()))
	print(" Maximum m/z bin: {}".format(im.get_max_mass()))
	
	# Write Binned Mass Spectra to OpenChrom-like CSV file
	ms = im.get_ms_at_index(0) # first mass spectrum
	spectrum_csv = open(spectrum_csv_file, 'w')
	spectrum_csv.write('RT(milliseconds);RT(minutes) - NOT USED BY IMPORT;RI;')
	spectrum_csv.write(';'.join(str(mz) for mz in ms.mass_list))
	spectrum_csv.write("\n")
		
	for scan in range(len(times)):
		spectrum_csv.write("{};{};{};".format(int(times[scan]*1000),rounders((times[scan]/60),"0.0000000000"),0))	
		ms = im.get_ms_at_index(scan)
		spectrum_csv.write(';'.join(str(intensity) for intensity in ms.mass_spec))
		spectrum_csv.write('\n')
	spectrum_csv.close()
	
	## Data filtering

	# Note that Turbomass does not use smoothing for qualitative method.	
	# Top-hat baseline Correction seems to bring down noise,
	#  retaning shapes, but keeps points on actual peaks
	
	#dump_object(im, "output/im.dump") # un-processed output

	n_scan, n_mz = im.get_size()
	for ii in range(n_mz):
		#print("\rWorking on IC#", ii+1, '  ',end='')
		ic = im.get_ic_at_index(ii)
		ic_smooth = savitzky_golay(ic)
		ic_bc = tophat(ic_smooth, struct=tophat_struct)
		im.set_ic_at_index(ii, ic_bc)

	#dump_object(im, "output/im-proc.dump") # processed output
		
	# Peak Detection based on Biller and Biemann, 1974, with a window
	#  of n points, and combining y scans if they apex next to each other
	peak_list = BillerBiemann(im, points=bb_points, scans=bb_scans) 
	
	print(" Number of peaks identified before filtering: {}".format(len(peak_list)))
	
	# Filtering peak lists with automatic noise filtering
	noise_level = window_analyzer(tic)
	peak_list = num_ions_threshold(peak_list, noise_thresh, noise_level)
	# why use 2 for number of ions above threshold?
	print(" Number of peaks identified: {}".format(len(peak_list)))

	# Peak Areas
	peak_area_list = []
	filtered_peak_list = []
	
	for peak in peak_list:
		apex_mass_list = peak.get_mass_spectrum().mass_list
		apex_mass_spec = peak.get_mass_spectrum().mass_spec
		base_peak_intensity = max(apex_mass_spec)
		base_peak_index = [index for index, intensity in enumerate(apex_mass_spec) if intensity == base_peak_intensity][0]
		base_peak_mass = apex_mass_list[base_peak_index]
		#print(base_peak_mass)
		if base_peak_mass in base_peak_filter:
			continue # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
		
		area = peak_sum_area(im, peak)
		peak.set_area(area)
		peak_area_list.append(area)
		filtered_peak_list.append(peak)
	
	# Save the TIC and Peak List
	tic.write(os.path.join(ExprDir,"{}_tic.dat".format(sample_name)),formatting=False)
	store_peaks(filtered_peak_list,os.path.join(ExprDir,"{}_peaks.dat".format(sample_name)))
	
	# from https://stackoverflow.com/questions/16878715/how-to-find-the-index-of-n-largest-elements-in-a-list-or-np-array-python?lq=1
	top_peaks = sorted(range(len(peak_area_list)), key=lambda x: peak_area_list[x])
	
	# Write to turbomass-like CSV file
	report_csv = open(report_csv_file, "w")
	
	# Write to GunShotMatch Combine-like CSV file
	combine_csv = open(combined_csv_file, "w")
	
	combine_csv.write(sample_name)
	combine_csv.write("\n")
		
	report_csv.write("#;RT;Scan;Height;Area\n")
	combine_csv.write("Retention Time;Peak Area;;Lib;Match;R Match;Name;CAS Number;Scan\n")
	
	report_buffer = []
	
	for index in top_peaks:
		# Peak Number (1-80)
		peak_number = top_peaks.index(index)+1 
		# Retention time (minutes, 3dp)
		RT = rounders(filtered_peak_list[index].get_rt()/60,"0.000") 
		
		if not target_range[0] < RT <= target_range[1]:
			continue # skip the peak if it is outside the desired range
		
		# scan number, not that we really nead it as the peak object has the spectrum
		Scan = data.get_index_at_time(filtered_peak_list[index].get_rt())+1 
		# the binned mass spectrum
		filtered_peak_list[index].get_mass_spectrum() 
		# TIC intensity, as proxy for Peak height, which should be from baseline
		Height = '{:,}'.format(rounders(tic.get_intensity_at_index(data.get_index_at_time(filtered_peak_list[index].get_rt())),"0"))
		# Peak area, originally in "intensity seconds", so dividing by 60 to
		#  get "intensity minutes" like turbomass uses
		Area = '{:,}'.format(rounders(filtered_peak_list[index].get_area()/60,"0.0")) 
		
		#report_csv.write("{};{};{};{};{};{}\n".format(peak_number, RT, Scan, Height, Area,bounds))
		report_buffer.append([peak_number, RT, Scan, Height, Area])

	report_buffer = report_buffer[::-1] # Reverse list order

	# List of peaks already added to report
	existing_peaks = []

	filtered_report_buffer = []
	
	for row in report_buffer:
		filtered_report_buffer.append(row)
	
	filtered_report_buffer = filtered_report_buffer[:number_of_peaks]
	
	filtered_report_buffer.sort(key=operator.itemgetter(2))
	
	for row in filtered_report_buffer:
		index = filtered_report_buffer.index(row)
		report_csv.write(";".join([str(i) for i in row]))
		
		ms = im.get_ms_at_index(row[2]-1)
		
		create_msp("{}_{}".format(sample_name,row[1]),ms.mass_list, ms.mass_spec)
		matches_dict = nist_ms_comparison("{}_{}".format(sample_name,row[1]),ms.mass_list, ms.mass_spec)
		
		combine_csv.write("{};{};Page {} of 80;;;;;;{}\n".format(row[1],row[4],index+1,row[2]))
		
		for hit in range(1,6):
			report_csv.write(str(matches_dict["Hit{}".format(hit)]))
			report_csv.write(";")
			combine_csv.write(";;{};{};{};{};{};{};\n".format(hit,
					matches_dict["Hit{}".format(hit)]["Lib"],
					matches_dict["Hit{}".format(hit)]["MF"],
					matches_dict["Hit{}".format(hit)]["RMF"],
					matches_dict["Hit{}".format(hit)]["Name"],
					matches_dict["Hit{}".format(hit)]["CAS"],
					))

		report_csv.write("\n")
		
		time.sleep(2)
		
	report_csv.close()
	combine_csv.close()
	
	# Create an experiment
	expr = Experiment(sample_name, filtered_peak_list)
	expr.sele_rt_range(["{}m".format(target_range[0]),"{}m".format(target_range[1])])
	store_expr(os.path.join(ExprDir,"{}.expr".format(sample_name)), expr)
	
	return 0