def get_data_for_a_compound(mz_ref, rt_ref, what_to_get, h5file, extra_time):
    """
    A helper function to query the various metatlas data selection 
    commands for a compound defined in an experimental atlas.

    Parameters
    ----------
    mz_ref : metatlas m/z reference object
        contains the m/z, m/z tolerance, and tolerance units used to slice the m/z dimension
    rt_ref : metatlas retention time reference object
        contains the rt min, max, peak, and units used to slice the retention time dimension
    what_to_get : list of str
        one or more of ['ms1_summary', 'eic', '2dhist', 'msms']
    h5file : str
        path to the input HDF5 file
    extra_time : float
        extra retention time (in the same units as rt_ref) padded on either side of the EIC window

    Returns
    -------
    return_data : dict
        one entry per requested item in what_to_get; polarity (0 for negative,
        1 for positive ionization) is derived from mz_ref.detected_polarity
    """
    #TODO : polarity should be handled in the experiment and not a loose parameter
    import numpy as np
    from metatlas import h5_query as h5q
    import tables

    #get a pointer to the hdf5 file
    fid = tables.open_file(h5file)  #TODO: should be a "with open:"

    if mz_ref.detected_polarity == 'positive':
        polarity = 1
    else:
        polarity = 0

    mz_theor = mz_ref.mz
    if mz_ref.mz_tolerance_units == 'ppm':
        ppm_uncertainty = mz_ref.mz_tolerance
    else:  #absolute m/z tolerance; convert to ppm
        ppm_uncertainty = mz_ref.mz_tolerance / mz_ref.mz * 1e6

#     if 'min' in rt_ref.rt_units: #convert to seconds
#         rt_min = rt_ref.rt_min / 60
#         rt_max = rt_ref.rt_max / 60
#     else:
    rt_min = rt_ref.rt_min
    rt_max = rt_ref.rt_max

    mz_min = mz_theor - mz_theor * ppm_uncertainty / 1e6
    mz_max = mz_theor + mz_theor * ppm_uncertainty / 1e6

    return_data = {}

    if 'ms1_summary' in what_to_get:
        #Get Summary Data

        #First get MS1 Raw Data
        ms_level = 1
        return_data['ms1_summary'] = {}
        try:
            ms1_data = h5q.get_data(fid,
                                    ms_level=1,
                                    polarity=polarity,
                                    min_mz=mz_min,
                                    max_mz=mz_max,
                                    min_rt=rt_min,
                                    max_rt=rt_max)

            return_data['ms1_summary']['polarity'] = polarity
            return_data['ms1_summary']['mz_centroid'] = np.sum(
                np.multiply(ms1_data['i'], ms1_data['mz'])) / np.sum(
                    ms1_data['i'])
            return_data['ms1_summary']['rt_centroid'] = np.sum(
                np.multiply(ms1_data['i'], ms1_data['rt'])) / np.sum(
                    ms1_data['i'])
            idx = np.argmax(ms1_data['i'])
            return_data['ms1_summary']['mz_peak'] = ms1_data['mz'][idx]
            return_data['ms1_summary']['rt_peak'] = ms1_data['rt'][idx]
            return_data['ms1_summary']['peak_height'] = ms1_data['i'][idx]
            return_data['ms1_summary']['peak_area'] = np.sum(ms1_data['i'])
        except Exception:
            return_data['ms1_summary']['polarity'] = []
            return_data['ms1_summary']['mz_centroid'] = []
            return_data['ms1_summary']['rt_centroid'] = []
            return_data['ms1_summary']['mz_peak'] = []
            return_data['ms1_summary']['rt_peak'] = []
            return_data['ms1_summary']['peak_height'] = []
            return_data['ms1_summary']['peak_area'] = []

    if 'eic' in what_to_get:
        #Get Extracted Ion Chromatogram
        # TODO : If a person calls for summary, then they will already have the MS1 raw data
        ms_level = 1  #set here so an EIC can be requested without 'ms1_summary'
        return_data['eic'] = {}
        try:
            rt, intensity = h5q.get_chromatogram(fid,
                                                 mz_min,
                                                 mz_max,
                                                 ms_level=ms_level,
                                                 polarity=polarity,
                                                 min_rt=rt_min - extra_time,
                                                 max_rt=rt_max + extra_time)
            return_data['eic']['rt'] = rt
            return_data['eic']['intensity'] = intensity
            return_data['eic']['polarity'] = polarity

        except Exception:
            return_data['eic']['rt'] = []
            return_data['eic']['intensity'] = []
            return_data['eic']['polarity'] = []

    if '2dhist' in what_to_get:
        #Get 2D histogram of intensity values in m/z and retention time
        mzEdges = np.logspace(np.log10(100), np.log10(1000), 10000)
        #         mzEdges = np.linspace(mz_theor - 3, mz_theor + 30,100) #TODO : number of mz bins should be an optional parameter
        rtEdges = np.linspace(
            rt_min, rt_max, 100
        )  #TODO : number of rt bins should be an optional parameter. When not provided, it should default to unique bins
        ms_level = 1  #TODO : ms_level should be a parameter
        return_data['2dhist'] = {}
        return_data['2dhist'] = h5q.get_heatmap(fid, mzEdges, rtEdges,
                                                ms_level, polarity)
        return_data['2dhist']['polarity'] = polarity

    if 'msms' in what_to_get:
        #Get Fragmentation Data
        ms_level = 2
        return_data['msms'] = {}
        try:
            fragmentation_data = h5q.get_data(
                fid,
                ms_level=ms_level,
                polarity=polarity,
                min_mz=0,
                max_mz=mz_theor + 2,  #TODO : this needs to be a parameter
                min_rt=rt_min,
                max_rt=rt_max,
                min_precursor_MZ=mz_min - 0.015,
                max_precursor_MZ=mz_max + 0.015
            )  #Widen by 0.015 because Thermo doesn't store accurate precursor m/z
            #                     min_precursor_intensity=0, #TODO : this needs to be a parameter
            #                     max_precursor_intensity=0,#TODO : this needs to be a parameter
            #                     min_collision_energy=0,#TODO : this needs to be a parameter
            #                     max_collision_energy=0)#TODO : this needs to be a parameter
            #         prt,pmz = get_unique_scan_data(fragmentation_data)
            #         rt_cutoff = 0.23
            #         mz_cutoff = 0.05
            #         list_of_prt,list_of_pmz = get_non_redundant_precursor_list(prt,pmz,rt_cutoff,mz_cutoff)
            #         return_data['msms']['data'] = organize_msms_scan_data(fragmentation_data,list_of_prt,list_or_pmz)
            return_data['msms'][
                'most_intense_precursor'] = retrieve_most_intense_msms_scan(
                    fragmentation_data)
            return_data['msms']['data'] = fragmentation_data
            return_data['msms']['polarity'] = polarity
        except Exception:
            return_data['msms']['most_intense_precursor'] = []
            return_data['msms']['data'] = []
            return_data['msms']['polarity'] = []

    fid.close()  #close the file
    return return_data
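A minimal usage sketch for the helper above (not taken from the metatlas source): the MzRef and RtRef namedtuples are hypothetical stand-ins that expose only the attributes the function actually reads, and the file path and reference values are placeholders.

from collections import namedtuple

# hypothetical stand-ins for the metatlas m/z and retention time reference objects
MzRef = namedtuple('MzRef', ['mz', 'mz_tolerance', 'mz_tolerance_units', 'detected_polarity'])
RtRef = namedtuple('RtRef', ['rt_min', 'rt_max'])

mz_ref = MzRef(mz=180.0634, mz_tolerance=10, mz_tolerance_units='ppm', detected_polarity='positive')
rt_ref = RtRef(rt_min=3.0, rt_max=5.0)

# request the MS1 summary and the extracted ion chromatogram from a placeholder file
result = get_data_for_a_compound(mz_ref, rt_ref,
                                 ['ms1_summary', 'eic'],
                                 'example_run.h5', extra_time=0.5)
print(result['ms1_summary']['peak_height'])
print(len(result['eic']['rt']))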
Example #3
def create_pactolus_msms_data_container(myfiles,target_directory,min_intensity,min_rt = 1,max_rt = 22,make_container=True):
    # peak_arrayindex: This is a 2D array with the shape (num_spectra, 3).
    # The dataset contains an index that tells us:
    # i) the x location of each spectrum [:,0],
    # ii) the y location of each spectrum [:,1], and
    # iii) the index where each spectrum starts in the peak_mz and peak_value arrays.
    # Below, the array is first filled with [0,i,0] values to define unique x/y locations
    # for each spectrum (items i and ii), and the last column is then overwritten with the
    # start index of each spectrum, which is just the cumulative sum of the spectrum lengths.
    # When you create the start/stop locations you need to:
    # prepend [0] to the cumulative sums (the first spectrum starts at 0, not at its length), and
    # remove the last entry to make sure the array has the correct length.
    # That is why I did the following:
    # np.cumsum([0] + [ ri['m/z array'].shape[0] for ri in good_list ])[:-1]
    if not os.path.exists(target_directory):
        try:
            os.makedirs(target_directory)
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    for myfile in myfiles:
        finfo = h5q.get_info(myfile)
        with tables.open_file(myfile) as fid:    
            num_pos_data = finfo['ms1_pos']['nrows'] + finfo['ms2_pos']['nrows']
            num_neg_data = finfo['ms1_neg']['nrows'] + finfo['ms2_neg']['nrows']
            do_polarity = []
            if num_pos_data > 0:
                do_polarity.append(1)
            if num_neg_data > 0:
                do_polarity.append(0)
            scan_polarity = []
            for my_polarity in do_polarity:
                container_file = os.path.join(target_directory,'container_file_polarity_%d.h5'%(my_polarity))
                if not os.path.isfile(container_file):
                    make_container=True
                if make_container:
                    data = h5q.get_data(fid, ms_level=2, polarity=my_polarity, min_rt=min_rt, max_rt=max_rt, min_precursor_intensity=min_intensity)  #TODO: filter by intensity
                    prt,pmz,pintensity = mgd.get_unique_scan_data(data)
                    for i in range(len(pintensity)):
                        scan_polarity.append(my_polarity)
                    msms_data = mgd.organize_msms_scan_data(data,prt,pmz,pintensity)
                    fpl = {}
                    # peak_mz : This is a 1D array with m/z values for all the spectra stored as spectrum_1, spectrum_2, etc.
                    fpl['peak_mz'] = np.concatenate(tuple( s[:,0] for s in msms_data['spectra']), axis = -1)
                    # peak_value: This is a 1D array with the intensity values corresponding to the m/z values stored in peak_mz
                    fpl['peak_value'] = np.concatenate(tuple( s[:,1] for s in msms_data['spectra']), axis = -1)
                    fpl['precursor_mz'] = np.asarray(msms_data['precursor_mz'])
                    fpl['peak_arrayindex'] = np.asarray([[0, i, 0] for i,rt in enumerate(prt)]) 
                    fpl['peak_arrayindex'][:,2] = np.cumsum([0] + [ s[:,0].shape[0] for s in msms_data['spectra'] ])[:-1]
                    with h5py.File(container_file,'a') as output_file:
                        group_name = os.path.basename(myfile)
                        if group_name in list(output_file.keys()):
                            del output_file[group_name]
                #         if group_name not in output_file.keys():
                        output_group = output_file.create_group(group_name)
                #         else:
                #             output_group = output_file[group_name]
                        for key, value in six.iteritems(fpl):
                            output_group[key] = value
                        experiment_group = output_group.create_group('experiment_metadata')
                        experiment_group['filename'] = group_name
                        scan_group = output_group.create_group('scan_metadata')
                        scan_group['peak_mz'] = np.asarray(msms_data['precursor_mz'])
                        scan_group['peak_rt'] = np.asarray(msms_data['precursor_rt'])
                        scan_group['peak_intensity'] = np.asarray(msms_data['precursor_intensity'])
                        scan_group['polarity'] = np.asarray(scan_polarity) # 1 for pos and 0 for neg
                write_pactolus_job_file(myfile,container_file,my_polarity)
    return container_file
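The peak_arrayindex bookkeeping described in the comments above is easy to get wrong, so here is a minimal self-contained sketch (toy spectra and plain numpy only, no HDF5 or metatlas calls) of how peak_mz, peak_value, and the start offsets fit together.

import numpy as np

# three toy spectra with 2, 3, and 1 peaks (columns: m/z, intensity)
spectra = [np.array([[100.0, 10.0], [101.0, 20.0]]),
           np.array([[150.0, 5.0], [151.0, 7.0], [152.0, 9.0]]),
           np.array([[200.0, 42.0]])]

# flatten all spectra into the 1D peak_mz / peak_value arrays
peak_mz = np.concatenate([s[:, 0] for s in spectra])
peak_value = np.concatenate([s[:, 1] for s in spectra])

# unique (x, y) locations plus the start offset of each spectrum;
# prepend 0 and drop the final cumulative sum so each offset points at a spectrum start
peak_arrayindex = np.asarray([[0, i, 0] for i in range(len(spectra))])
peak_arrayindex[:, 2] = np.cumsum([0] + [s.shape[0] for s in spectra])[:-1]

print(peak_arrayindex)                                       # start offsets are [0, 2, 5]
print(peak_mz[peak_arrayindex[1, 2]:peak_arrayindex[2, 2]])  # m/z values of the second spectrum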
Example #4
def test_get_data():
    # select data with rt >= 5 and m/z >= 100 and compare against known reference values
    data = get_data(fid, min_rt=5, min_mz=100)
    assert np.allclose(data['i'].mean(), 7825.55387233)
    assert np.allclose(data['mz'][0], 100.979026794)
    assert np.allclose(data['rt'][0], 5.00666666031)