def get_data_for_a_compound(mz_ref, rt_ref, what_to_get, h5file, extra_time):
    """Query the metatlas HDF5 data-selection helpers for one atlas compound.

    Parameters
    ----------
    mz_ref : metatlas m/z reference object
        Supplies ``mz``, ``mz_tolerance``, ``mz_tolerance_units`` and
        ``detected_polarity``; used to slice the m/z dimension.
    rt_ref : metatlas retention-time reference object
        Supplies ``rt_min`` and ``rt_max``; used to slice the retention-time
        dimension.
    what_to_get : list of str
        One or more of ['ms1_summary', 'eic', '2dhist', 'msms'].
    h5file : str
        Path to the input HDF5 file.
    extra_time : float
        Padding (same units as rt_min/rt_max) added on both sides of the
        EIC retention-time window.

    Returns
    -------
    dict
        One sub-dictionary per requested item.  A query that raises is
        reported as empty lists in the corresponding sub-dictionary
        (best-effort behavior preserved from the original).
    """
    # TODO : polarity should be handled in the experiment and not a loose parameter
    import numpy as np
    from metatlas import h5_query as h5q
    import tables

    polarity = 1 if mz_ref.detected_polarity == 'positive' else 0

    mz_theor = mz_ref.mz
    if mz_ref.mz_tolerance_units == 'ppm':
        ppm_uncertainty = mz_ref.mz_tolerance
    else:
        # Tolerance was given in absolute m/z units; convert to ppm.
        ppm_uncertainty = mz_ref.mz_tolerance / mz_ref.mz * 1e6

    rt_min = rt_ref.rt_min
    rt_max = rt_ref.rt_max
    mz_min = mz_theor - mz_theor * ppm_uncertainty / 1e6
    mz_max = mz_theor + mz_theor * ppm_uncertainty / 1e6

    return_data = {}
    fid = tables.open_file(h5file)
    try:
        if 'ms1_summary' in what_to_get:
            # Summary statistics of the MS1 raw data inside the m/z / rt window.
            return_data['ms1_summary'] = {}
            try:
                ms1_data = h5q.get_data(fid,
                                        ms_level=1,
                                        polarity=polarity,
                                        min_mz=mz_min,
                                        max_mz=mz_max,
                                        min_rt=rt_min,
                                        max_rt=rt_max)
                idx = np.argmax(ms1_data['i'])
                total_i = np.sum(ms1_data['i'])
                return_data['ms1_summary']['polarity'] = polarity
                return_data['ms1_summary']['mz_centroid'] = np.sum(
                    np.multiply(ms1_data['i'], ms1_data['mz'])) / total_i
                return_data['ms1_summary']['rt_centroid'] = np.sum(
                    np.multiply(ms1_data['i'], ms1_data['rt'])) / total_i
                return_data['ms1_summary']['mz_peak'] = ms1_data['mz'][idx]
                return_data['ms1_summary']['rt_peak'] = ms1_data['rt'][idx]
                return_data['ms1_summary']['peak_height'] = ms1_data['i'][idx]
                return_data['ms1_summary']['peak_area'] = total_i
            except Exception:  # was a bare except; keep best-effort semantics
                for key in ('polarity', 'mz_centroid', 'rt_centroid',
                            'mz_peak', 'rt_peak', 'peak_height', 'peak_area'):
                    return_data['ms1_summary'][key] = []

        if 'eic' in what_to_get:
            # Extracted ion chromatogram.
            # TODO : If a person calls for summary, then they will already
            # have the MS1 raw data.
            return_data['eic'] = {}
            try:
                # BUG FIX: ms_level was previously only bound when
                # 'ms1_summary' was also requested; an EIC-only call raised
                # NameError that the bare except silently turned into empty
                # output.  The EIC is an MS1 query, so pass 1 explicitly.
                rt, intensity = h5q.get_chromatogram(
                    fid,
                    mz_min,
                    mz_max,
                    ms_level=1,
                    polarity=polarity,
                    min_rt=rt_min - extra_time,
                    max_rt=rt_max + extra_time)
                return_data['eic']['rt'] = rt
                return_data['eic']['intensity'] = intensity
                return_data['eic']['polarity'] = polarity
            except Exception:
                return_data['eic']['rt'] = []
                return_data['eic']['intensity'] = []
                return_data['eic']['polarity'] = []

        if '2dhist' in what_to_get:
            # 2D histogram of intensity values in m/z and retention time.
            mzEdges = np.logspace(np.log10(100), np.log10(1000), 10000)
            # TODO : number of mz bins should be an optional parameter
            rtEdges = np.linspace(rt_min, rt_max, 100)
            # TODO : number of rt bins should be an optional parameter.
            # When not provided, it should default to unique bins.
            ms_level = 1  # TODO : ms_level should be a parameter
            return_data['2dhist'] = h5q.get_heatmap(fid, mzEdges, rtEdges,
                                                    ms_level, polarity)
            return_data['2dhist']['polarity'] = polarity

        if 'msms' in what_to_get:
            # Fragmentation (MS2) data for precursors inside the window.
            return_data['msms'] = {}
            try:
                fragmentation_data = h5q.get_data(
                    fid,
                    ms_level=2,
                    polarity=polarity,
                    min_mz=0,
                    max_mz=mz_theor + 2,  # TODO : this needs to be a parameter
                    min_rt=rt_min,
                    max_rt=rt_max,
                    min_precursor_MZ=mz_min - 0.015,
                    max_precursor_MZ=mz_max + 0.015
                )  # widen window because Thermo doesn't store accurate precursor m/z
                # min_precursor_intensity=0, #TODO : this needs to be a parameter
                # max_precursor_intensity=0, #TODO : this needs to be a parameter
                # min_collision_energy=0, #TODO : this needs to be a parameter
                # max_collision_energy=0) #TODO : this needs to be a parameter
                return_data['msms'][
                    'most_intense_precursor'] = retrieve_most_intense_msms_scan(
                        fragmentation_data)
                return_data['msms']['data'] = fragmentation_data
                return_data['msms']['polarity'] = polarity
            except Exception:
                return_data['msms']['most_intense_precursor'] = []
                return_data['msms']['data'] = []
                return_data['msms']['polarity'] = []
    finally:
        # Always release the HDF5 handle, even when a query raises
        # (resolves the original "should be a with open" TODO).
        fid.close()

    return return_data
def get_data_for_a_compound(mz_ref, rt_ref, what_to_get, h5file, extra_time):
    """Query the metatlas HDF5 data-selection helpers for one atlas compound.

    Parameters
    ----------
    mz_ref : metatlas m/z reference object
        Supplies ``mz``, ``mz_tolerance``, ``mz_tolerance_units`` and
        ``detected_polarity``; used to slice the m/z dimension.
    rt_ref : metatlas retention-time reference object
        Supplies ``rt_min`` and ``rt_max``; used to slice the retention-time
        dimension.
    what_to_get : list of str
        One or more of ['ms1_summary', 'eic', '2dhist', 'msms'].
    h5file : str
        Path to the input HDF5 file.
    extra_time : float
        Padding (same units as rt_min/rt_max) added on both sides of the
        EIC retention-time window.

    Returns
    -------
    dict
        One sub-dictionary per requested item.  A query that raises is
        reported as empty lists in the corresponding sub-dictionary
        (best-effort behavior preserved from the original).
    """
    # TODO : polarity should be handled in the experiment and not a loose parameter
    import numpy as np
    from metatlas import h5_query as h5q
    import tables

    polarity = 1 if mz_ref.detected_polarity == 'positive' else 0

    mz_theor = mz_ref.mz
    if mz_ref.mz_tolerance_units == 'ppm':
        ppm_uncertainty = mz_ref.mz_tolerance
    else:
        # Tolerance was given in absolute m/z units; convert to ppm.
        ppm_uncertainty = mz_ref.mz_tolerance / mz_ref.mz * 1e6

    rt_min = rt_ref.rt_min
    rt_max = rt_ref.rt_max
    mz_min = mz_theor - mz_theor * ppm_uncertainty / 1e6
    mz_max = mz_theor + mz_theor * ppm_uncertainty / 1e6

    return_data = {}
    fid = tables.open_file(h5file)
    try:
        if 'ms1_summary' in what_to_get:
            # Summary statistics of the MS1 raw data inside the m/z / rt window.
            return_data['ms1_summary'] = {}
            try:
                ms1_data = h5q.get_data(fid,
                                        ms_level=1,
                                        polarity=polarity,
                                        min_mz=mz_min,
                                        max_mz=mz_max,
                                        min_rt=rt_min,
                                        max_rt=rt_max)
                idx = np.argmax(ms1_data['i'])
                total_i = np.sum(ms1_data['i'])
                return_data['ms1_summary']['polarity'] = polarity
                return_data['ms1_summary']['mz_centroid'] = np.sum(
                    np.multiply(ms1_data['i'], ms1_data['mz'])) / total_i
                return_data['ms1_summary']['rt_centroid'] = np.sum(
                    np.multiply(ms1_data['i'], ms1_data['rt'])) / total_i
                return_data['ms1_summary']['mz_peak'] = ms1_data['mz'][idx]
                return_data['ms1_summary']['rt_peak'] = ms1_data['rt'][idx]
                return_data['ms1_summary']['peak_height'] = ms1_data['i'][idx]
                return_data['ms1_summary']['peak_area'] = total_i
            except Exception:  # was a bare except; keep best-effort semantics
                for key in ('polarity', 'mz_centroid', 'rt_centroid',
                            'mz_peak', 'rt_peak', 'peak_height', 'peak_area'):
                    return_data['ms1_summary'][key] = []

        if 'eic' in what_to_get:
            # Extracted ion chromatogram.
            # TODO : If a person calls for summary, then they will already
            # have the MS1 raw data.
            return_data['eic'] = {}
            try:
                # BUG FIX: ms_level was previously only bound when
                # 'ms1_summary' was also requested; an EIC-only call raised
                # NameError that the bare except silently turned into empty
                # output.  The EIC is an MS1 query, so pass 1 explicitly.
                rt, intensity = h5q.get_chromatogram(
                    fid,
                    mz_min,
                    mz_max,
                    ms_level=1,
                    polarity=polarity,
                    min_rt=rt_min - extra_time,
                    max_rt=rt_max + extra_time)
                return_data['eic']['rt'] = rt
                return_data['eic']['intensity'] = intensity
                return_data['eic']['polarity'] = polarity
            except Exception:
                return_data['eic']['rt'] = []
                return_data['eic']['intensity'] = []
                return_data['eic']['polarity'] = []

        if '2dhist' in what_to_get:
            # 2D histogram of intensity values in m/z and retention time.
            mzEdges = np.logspace(np.log10(100), np.log10(1000), 10000)
            # TODO : number of mz bins should be an optional parameter
            rtEdges = np.linspace(rt_min, rt_max, 100)
            # TODO : number of rt bins should be an optional parameter.
            # When not provided, it should default to unique bins.
            ms_level = 1  # TODO : ms_level should be a parameter
            return_data['2dhist'] = h5q.get_heatmap(fid, mzEdges, rtEdges,
                                                    ms_level, polarity)
            return_data['2dhist']['polarity'] = polarity

        if 'msms' in what_to_get:
            # Fragmentation (MS2) data for precursors inside the window.
            return_data['msms'] = {}
            try:
                fragmentation_data = h5q.get_data(
                    fid,
                    ms_level=2,
                    polarity=polarity,
                    min_mz=0,
                    max_mz=mz_theor + 2,  # TODO : this needs to be a parameter
                    min_rt=rt_min,
                    max_rt=rt_max,
                    min_precursor_MZ=mz_min - 0.015,
                    max_precursor_MZ=mz_max + 0.015
                )  # widen window because Thermo doesn't store accurate precursor m/z
                # min_precursor_intensity=0, #TODO : this needs to be a parameter
                # max_precursor_intensity=0, #TODO : this needs to be a parameter
                # min_collision_energy=0, #TODO : this needs to be a parameter
                # max_collision_energy=0) #TODO : this needs to be a parameter
                return_data['msms'][
                    'most_intense_precursor'] = retrieve_most_intense_msms_scan(
                        fragmentation_data)
                return_data['msms']['data'] = fragmentation_data
                return_data['msms']['polarity'] = polarity
            except Exception:
                return_data['msms']['most_intense_precursor'] = []
                return_data['msms']['data'] = []
                return_data['msms']['polarity'] = []
    finally:
        # Always release the HDF5 handle, even when a query raises
        # (resolves the original "should be a with open" TODO).
        fid.close()

    return return_data
def create_pactolus_msms_data_container(myfiles, target_directory, min_intensity, min_rt=1, max_rt=22, make_container=True):
    """Build per-polarity pactolus container HDF5 files from MS2 scan data.

    For each input file, MS2 data is pulled per polarity, reorganized into
    the flat pactolus layout (peak_mz / peak_value / peak_arrayindex plus
    scan metadata), written into ``container_file_polarity_<p>.h5`` under
    ``target_directory``, and a pactolus job file is written.

    Parameters
    ----------
    myfiles : iterable of str
        Paths to input metatlas HDF5 files.
    target_directory : str
        Directory for the container files; created if missing.
    min_intensity : number
        Minimum precursor intensity passed to the MS2 query.
    min_rt, max_rt : number, optional
        Retention-time window for the MS2 query.
    make_container : bool, optional
        Force (re)building container contents even when the container file
        already exists.  NOTE(review): once flipped to True for a missing
        container it stays True for all remaining files/polarities —
        confirm this stickiness is intended.

    Returns
    -------
    str
        Path of the last container file processed.
        NOTE(review): raises NameError if ``myfiles`` is empty (or no
        polarity has data), since ``container_file`` is only bound inside
        the loop.

    Layout notes (pactolus):
    peak_arrayindex is a 2D array with shape (num_spectra, 3) containing
    i) the x location of each spectrum [:,0],
    ii) the y location of each spectrum [:,1], and
    iii) the index where the spectrum starts in the peak_mz / peak_value
    arrays.  Rows are first filled with [0, i, 0] to define unique x/y
    locations per spectrum, then the last column is set to the cumulative
    sum of the spectrum lengths.  When creating the start locations:
    prepend [0] to the cumulative sums (the first spectrum starts at 0,
    not at its length) and drop the last entry so the array keeps the
    correct length, i.e.
    np.cumsum([0] + [ri['m/z array'].shape[0] for ri in good_list])[:-1].
    """
    # Create the output directory; EAFP guard against a concurrent creator.
    if not os.path.exists(target_directory):
        try:
            os.makedirs(target_directory)
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    for myfile in myfiles:
        finfo = h5q.get_info(myfile)
        with tables.open_file(myfile) as fid:
            # Only process polarities that actually contain MS1/MS2 rows.
            num_pos_data = finfo['ms1_pos']['nrows'] + finfo['ms2_pos']['nrows']
            num_neg_data = finfo['ms1_neg']['nrows'] + finfo['ms2_neg']['nrows']
            do_polarity = []
            if num_pos_data > 0:
                do_polarity.append(1)
            if num_neg_data > 0:
                do_polarity.append(0)
            # NOTE(review): scan_polarity accumulates across BOTH polarities of
            # this file and is written to each container — confirm intended.
            scan_polarity = []
            for my_polarity in do_polarity:
                container_file = os.path.join(target_directory, 'container_file_polarity_%d.h5' % (my_polarity))
                if not os.path.isfile(container_file):
                    make_container = True
                if make_container:
                    data = h5q.get_data(fid, ms_level=2, polarity=my_polarity, min_rt=min_rt, max_rt=max_rt, min_precursor_intensity=min_intensity)  # TODO: filter by intensity
                    prt, pmz, pintensity = mgd.get_unique_scan_data(data)
                    # One polarity flag per unique precursor scan.
                    for i in range(len(pintensity)):
                        scan_polarity.append(my_polarity)
                    msms_data = mgd.organize_msms_scan_data(data, prt, pmz, pintensity)
                    fpl = {}
                    # peak_mz: 1D array of m/z values for all spectra stored
                    # back-to-back as spectrum_1, spectrum_2, etc.
                    fpl['peak_mz'] = np.concatenate(tuple(s[:, 0] for s in msms_data['spectra']), axis=-1)
                    # peak_value: 1D array of intensities matching peak_mz.
                    fpl['peak_value'] = np.concatenate(tuple(s[:, 1] for s in msms_data['spectra']), axis=-1)
                    fpl['precursor_mz'] = np.asarray(msms_data['precursor_mz'])
                    # Unique [0, i, 0] row per spectrum; see layout notes above.
                    fpl['peak_arrayindex'] = np.asarray([[0, i, 0] for i, rt in enumerate(prt)])
                    # Column 2 = start offset of each spectrum in peak_mz/peak_value.
                    fpl['peak_arrayindex'][:, 2] = np.cumsum([0] + [s[:, 0].shape[0] for s in msms_data['spectra']])[:-1]
                    with h5py.File(container_file, 'a') as output_file:
                        group_name = os.path.basename(myfile)
                        # Replace any previous group for this input file.
                        if group_name in list(output_file.keys()):
                            output_file.__delitem__(group_name)
                        # if group_name not in output_file.keys():
                        output_group = output_file.create_group(group_name)
                        # else:
                        #     output_group = output_file[group_name]
                        for key, value in six.iteritems(fpl):
                            output_group[key] = value
                        experiment_group = output_group.create_group('experiment_metadata')
                        experiment_group['filename'] = group_name
                        scan_group = output_group.create_group('scan_metadata')
                        scan_group['peak_mz'] = np.asarray(msms_data['precursor_mz'])
                        scan_group['peak_rt'] = np.asarray(msms_data['precursor_rt'])
                        scan_group['peak_intensity'] = np.asarray(msms_data['precursor_intensity'])
                        scan_group['polarity'] = np.asarray(scan_polarity)  # 1 for pos and 0 for neg
                # Job file is (re)written even when the container was reused.
                write_pactolus_job_file(myfile, container_file, my_polarity)
    return container_file
def test_get_data():
    """Spot-check get_data results against known fixture values."""
    result = get_data(fid, min_rt=5, min_mz=100)
    expectations = [
        (result['i'].mean(), 7825.55387233),
        (result['mz'][0], 100.979026794),
        (result['rt'][0], 5.00666666031),
    ]
    for actual, expected in expectations:
        assert np.allclose(actual, expected)