Example no. 1
def get_average_profile(dbfilepath, dataid=''):
    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[])
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, 'sizesp')
    sp_mean = np.zeros((sizesp[0], sizesp[1]))
    crt = mh5.load_dataset(dbfilepath, 'crt')

    #crt = crt / 60;

    j = -1

    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, datasetid + dataid)
            sp_mean = sp_mean + sp
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ": Failed to read in")
            printlog(inst)

    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, (dataidx)))
    sp_mean = sp_mean / len(dataidx)

    return sp_mean, crt, datasets
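
A minimal usage sketch for the function above. The database path and the '/sp' value for dataid are hypothetical, and the function is assumed to be importable together with its mh5/printlog dependencies; note that it returns None when the file or the datasets are missing.

# Hypothetical usage (the h5 path and dataid suffix are illustrative only)
result = get_average_profile('/path/to/msi_data.h5', dataid='/sp')
if result is not None:
    sp_mean, crt, datasets = result
    print('Averaged %s datasets; mean profile shape: %s' % (len(datasets), sp_mean.shape))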
Example no. 2
def save_proc_meta(dbfilepath, h5writepath, h5readpath):
    if h5writepath != h5readpath:
        mzrange = mh5.load_dataset(dbfilepath, h5readpath + 'mzrange')
        rtrange = mh5.load_dataset(dbfilepath, h5readpath + 'rtrange')
        cmz     = mh5.load_dataset(dbfilepath, h5readpath + 'cmz')
        crt     = mh5.load_dataset(dbfilepath, h5readpath + 'crt')
        sizesp  = mh5.load_dataset(dbfilepath, h5readpath + 'sizesp')

        mh5.save_dataset(dbfilepath, h5writepath + 'mzrange', data=mzrange)
        mh5.save_dataset(dbfilepath, h5writepath + 'rtrange', data=rtrange)
        mh5.save_dataset(dbfilepath, h5writepath + 'cmz', data=cmz, compression_opts=5)
        mh5.save_dataset(dbfilepath, h5writepath + 'crt', data=crt, compression_opts=5)
        mh5.save_dataset(dbfilepath, h5writepath + 'sizesp', data=sizesp)
Example no. 3
 def get_refsp_h5(self,dbfilepath,datasets,h5readpath):
     
     with h5py.File(dbfilepath, 'r') as h5file:    
         i = 0
         printlog("\nPreparing reference profile for inter-sample retention time drift alignment across %s datasets from \n%s...\n" % (len(datasets),dbfilepath))
         dataindex = 0 
         for datasetid in datasets:
             dataindex = dataindex + 1        
             try:        
                 sp2D = mh5.load_dataset(h5file,h5readpath[:-1] + datasetid+ '/sp')
                 if i==0:
                     i = i + 1
                     ref2D = sp2D
                     continue
                 if self.reference=='mean':
                     ref2D = (sp2D + ref2D)                  
                 i = i + 1
                 printlog('%s. %s: Successfully updated from -> %s%s' %(dataindex, datasetid, os.path.basename(dbfilepath),h5readpath))
             except Exception as inst:
                 printlog('%s. %s: Failed' %(dataindex, datasetid))  
                 printlog(inst)
                 traceback.print_exc()
         
         if self.reference=='mean':
             self.ref2D = ref2D/i
Example no. 4
def do_mzalignment(dbfilepath,
                   method='binning',
                   params={
                       'binshift': 0.3,
                       'binsize': 1,
                       'units': 'Da',
                       'h5writepath': '/proc'
                   },
                   istrain=1):
    """
    Performs intra-sample correction of molecular m/z drifts  between scans of individual samples
    
    Args:
    
        dbfilepath: a user-specified path to the h5 database file
                    
        method: the method choice for intra-sample m/z drift corrections (e.g. bining by default)  
        
        params: dictionary of parameter arguments for the correction method (e.g. ``{'binshift': 0.3, 'binsize':1, 'units': 'Da'})`` for bining) 
             
        dbfilepath: processed intensitiy matrices  
    """

    dataset_names = mh5.get_dataset_names(dbfilepath,
                                          dataset_names=[],
                                          pathinh5='/raw')
    if not dataset_names:
        return
    else:
        params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])

    if istrain == 1:

        with h5py.File(dbfilepath, 'r') as h5file:
            cmzrange = mh5.load_dataset(h5file, '/raw/cmzrange')
            crtrange = mh5.load_dataset(h5file, '/raw/crtrange')

        #delete unnecessary variables and save into hdf5 database file
        if method == 'binning':
            mzAlignObj = Binmz(method, params, cmzrange, crtrange)
        mzAlignObj.save_procobj(dbfilepath, params['h5writepath'])

    elif istrain == 0:
        mzAlignObj = Binmz()
        mzAlignObj.load_procobj(dbfilepath, params['h5writepath'])

    mzAlignObj.bin_h5(dbfilepath, dataset_names)
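
A hedged example of how do_mzalignment might be called; the file path is illustrative and the parameter values simply restate the defaults from the signature above.

# Illustrative call (the h5 path is hypothetical); parameters mirror the defaults above
do_mzalignment('/path/to/gcms_data.h5',
               method='binning',
               params={'binshift': 0.3, 'binsize': 1, 'units': 'Da',
                       'h5writepath': '/proc'},
               istrain=1)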
Example no. 5
def get_data(dbfilepath, h5readpath):
    """ Extract data from the h5file and output as a dictionary of 'x', 'y', 'ids', and 'colors' for each sample """
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath,
                                         dataset_names=[],
                                         pathinh5=h5readpath)
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    tics = np.zeros((sizesp[0], sizesp[2]))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))
    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath,
                                  ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ": Failed to read in")
            printlog(inst)
            traceback.print_exc()

    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, (dataidx)))
    tics = tics[:, dataidx]
    nrows, ncols = tics.shape
    sp = {'x': [], 'y': [], 'id': [], 'color': []}
    for i in range(ncols):
        sp['x'].append(crt / 60)
        sp['y'].append(tics[:, i])
        sp['id'].append(datasets[i])
    sp['color'] = colorgenerator(ncols)
    return sp
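
For orientation, a sketch of how the dictionary returned by get_data could be plotted; matplotlib is an assumption here (the original module may target a different plotting backend), the file path and h5readpath are placeholders, and colorgenerator is assumed to yield colour specifications matplotlib accepts (e.g. hex strings).

# Sketch only: plot each sample's TIC from the dictionary returned by get_data
import matplotlib.pyplot as plt

sp = get_data('/path/to/gcms_data.h5', '/proc')  # hypothetical path and group
if sp is not None:
    for x, y, label, color in zip(sp['x'], sp['y'], sp['id'], sp['color']):
        plt.plot(x, y, label=label, color=color)
    plt.xlabel('Retention time (min)')
    plt.ylabel('Total ion current')
    plt.legend()
    plt.show()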
Example no. 6
def filternoise_h5(dbfilepath, datasets, SmoothObject, BaselineObject, params):

    if (SmoothObject) and (BaselineObject):
        printlog('\n' + "Preparing for smoothing and baseline correction " +
                 os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully smoothed, baseline corrected and deposited ->'
    elif (SmoothObject):
        printlog('\n' + "Preparing for smoothing " +
                 os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully smoothed and deposited ->'
    elif (BaselineObject):
        printlog('\n' + "Preparing for baseline correction " +
                 os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully baseline corrected and deposited ->'
    else:
        return

    with h5py.File(dbfilepath, 'a') as h5file:

        i = 0
        dataindex = 0
        for datasetid in datasets:
            dataindex = dataindex + 1
            try:
                sp2D = mh5.load_dataset(
                    h5file, params['h5readpath'][:-1] + datasetid + '/sp')
                if (SmoothObject):
                    sp2D = SmoothObject.fit(sp2D)
                if (BaselineObject):
                    sp2D = BaselineObject.fit(sp2D)

                mh5.save_dataset(h5file,params['h5writepath'][:-1] + datasetid +'/sp',\
                                 data=sp2D,compression_opts = 5)
                printlog('%s. %s: %s %s%s' %(dataindex, datasetid, printstring, \
                                          os.path.basename(dbfilepath),params['h5writepath']))
                i = i + 1

                target_gname = params['h5writepath'][:-1] + datasetid
                source_gname = params['h5readpath'][:-1] + datasetid

                wgroup = h5file[target_gname]
                sgroup = h5file[source_gname]

                wgroup.attrs['is_raw'] = False
                wgroup.attrs['is_OK'] = True
                wgroup.attrs['is_processed'] = True
                wgroup.attrs['is_continuous'] = True
                wgroup.attrs['is_sample_dataset'] = True
                wgroup.attrs['parent'] = np.string_(source_gname)
                mh5.copy_meta_over(sgroup, wgroup)

            except Exception as inst:
                printlog('%s. %s: %s' % (dataindex, datasetid, 'Failed'))
                printlog(inst)
                traceback.print_exc()
Example no. 7
def get_data(dbfilepath, h5readpath='sp2D'):
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath,
                                         dataset_names=[],
                                         pathinh5=h5readpath)
        if not datasets:
            print(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        print(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    tics = np.zeros((sizesp[0], sizesp[2]))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))
    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath,
                                  ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            print(os.path.basename(datasetid) + ": Failed to read in")
            print(inst)
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, (dataidx)))
    tics = tics[:, dataidx]
    nrows, ncols = tics.shape
    sp = {'x': [], 'y': [], 'id': [], 'color': []}
    for i in range(ncols):
        sp['x'].append(crt)
        sp['y'].append(tics[:, i])
        sp['id'].append(datasets[i])
    sp['color'] = colorgenerator(ncols)
    return sp
Example no. 8
    def aling_h5(self,dbfilepath,datasets,h5readpath,h5writepath):
        
        #if not self.ref2D:
        
            
        printlog("\nPerforming internal sample retention time profile alignment across %s datasets from \n%s...\n" % (len(datasets),dbfilepath))
        dataindex = 0 
        with h5py.File(dbfilepath, 'a') as h5file:   
        
            #mh5.save_dataset(h5file, h5writepath + '/ref2D', data = self.ref2D, compression_opts = 5)
            
            for datasetid in datasets:
                dataindex = dataindex + 1                

                try:
                    sp2D = mh5.load_dataset(h5file, h5readpath[:-1] + datasetid + '/sp')
                    
                    nrt, nmz = sp2D.shape
                    
                    ref2D = np.mean(sp2D, axis=1)
                    
                    mh5.save_dataset(h5file, h5writepath + datasetid.lstrip('/') + '/ref2D', data = ref2D, compression_opts = 5)
                    
                    for i in range(nmz):
                        alprof = self.align(sp2D[:, i], ref2D[:])
                        sp2D[:, i] = alprof
                        
                    mh5.save_dataset(h5file, h5writepath[:-1] + datasetid+ '/sp', data = sp2D, compression_opts = 5)
                    
                    printlog('%s. %s: Successfully aligned and deposited -> %s%s' %(dataindex, datasetid, os.path.basename(dbfilepath), h5writepath))
                    
                    target_gname = h5writepath[:-1] + datasetid
                    source_gname = h5readpath[:-1] + datasetid

                    wgroup = h5file[target_gname]
                    sgroup = h5file[source_gname]

                    wgroup.attrs['is_raw'] = False
                    wgroup.attrs['is_OK'] = True
                    wgroup.attrs['is_processed'] = True
                    wgroup.attrs['is_continuous'] = True
                    wgroup.attrs['is_sample_dataset'] = True
                    wgroup.attrs['parent'] = np.string_(source_gname)
                    mh5.copy_meta_over(sgroup, wgroup)
                    
                    
                except Exception as inst:
                    printlog('%s. %s: Failed to be deposited' %(dataindex, datasetid))  
                    printlog(inst)
                    traceback.print_exc()
Example no. 9
def do_profile_alignment(dbfilepath,
                         method='rspa',
                         params={'recursion': 1,
                                 'minsegwidth': 100,
                                 'maxpeakshift': 10,
                                 'reference': 'mean',
                                 'h5readpath': '/proc',
                                 'h5writepath': '/proc'},
                         istrain=1):
    """
    Performs advanced adjustment for chromatographic peak position variations at full profile resolution
    using a recursive segment-wise peak alignment strategy
    
    Args:
    
        dbfilepath: The database file path
                    
        method: The choice of peak alignment method. Default value: 'rspa', i.e. Recursive segment-wise peak alignment.  
        
        params: The dictionary of peak alignment parameters
    """     
    
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath']  = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath,dataset_names=[],pathinh5 = params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width',\
                                 data=peak_width)

    if str(params['minsegwidth']).lower() == 'auto':
        params['minsegwidth'] = peak_width * 10.0
        printlog('Parameter "minsegwidth" is set to %s' % params['minsegwidth'])
    else:
        try:
            params['minsegwidth'] = float(params['minsegwidth'])
        except:
            raise LoggingValueError('Error! %s value for parameter "minsegwidth" cannot be converted to float!'%params['minsegwidth'])
            

    
    if str(params['maxpeakshift']).lower() == 'auto':
        params['maxpeakshift'] = peak_width * 5
        printlog('Parameter "maxpeakshift" is set to %s' % params['maxpeakshift'])
    else:
        try:
            params['maxpeakshift'] = float(params['maxpeakshift'])
        except:
            raise LoggingValueError('Error! %s value for parameter "maxpeakshift" cannot be converted to float!'%params['maxpeakshift'])
        

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if method == 'rspa':
            rtAlObj = RSPA(method, params, rtrange)

    elif istrain == 0:
        rtAlObj = RSPA()
        rtAlObj.load_procobj(dbfilepath, params['h5readpath'])

    rtAlObj.aling_h5(dbfilepath, dataset_names, params['h5readpath'], params['h5writepath'])
    
    if istrain==1:
         #save into hdf5 database file
        rtAlObj.export()
        rtAlObj.save_procobj(dbfilepath,params['h5writepath'])    
        rtAlObj.save_proc_meta(dbfilepath,params['h5writepath'],params['h5readpath'])
                    
    return
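
An illustrative call, assuming the data were previously binned and noise-filtered into '/proc' as in the other examples; passing 'auto' lets minsegwidth and maxpeakshift be derived from the peak width stored in the database, as handled above. The file path is a placeholder.

# Illustrative call (the h5 path is hypothetical)
do_profile_alignment('/path/to/gcms_data.h5',
                     method='rspa',
                     params={'recursion': 1,
                             'minsegwidth': 'auto',   # derived as peak_width * 10
                             'maxpeakshift': 'auto',  # derived as peak_width * 5
                             'reference': 'mean',
                             'h5readpath': '/proc',
                             'h5writepath': '/proc'},
                     istrain=1)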
Example no. 10
def do_noisefilter(dbfilepath,
                   smoothmethod='sqfilter',
                   smoothparams={
                       'window': 5,
                       'degree': 3
                   },
                   baselinemethod='tophat',
                   baselineparams={'frame': 90},
                   params={
                       'h5readpath': '/sp2D',
                       'h5writepath': '/spproc2D'
                   },
                   istrain=1):
    """
    Performs adjustment for high-frequency noise and lower-frequency baseline distortions
    caused by a variety of instrumental and experimental factors
    
    Args:
    
        dbfilepath: a user-specified path to the h5 database file
                    
        smoothmethod: The type of noise filtering method. Default value: 'sqfilter', i.e. the Savitzky-Golay filter.  
        
        smoothparams: The dictionary of parameter arguments for noise filtering method 
        
        baselinemethod: The type of baseline correction method. Default value: 'tophat', i.e. the top-hat morphological filter.
        
        baselineparams: The dictionary of parameter arguments for baseline correction method          
    """

    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath,
                                          dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath,
                                  params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width',\
                                 data=peak_width)

    if str(smoothparams['window']).lower() == 'auto':
        smoothparams['window'] = peak_width * 0.5
        printlog('Parameter "window" is set to %s' % smoothparams['window'])
    else:
        try:
            smoothparams['window'] = float(smoothparams['window'])
        except:
            raise LoggingValueError(
                'Error! %s value for parameter "window" cannot be converted to float!'
                % smoothparams['window'])

    if str(baselineparams['frame']).lower() == 'auto':
        baselineparams['frame'] = peak_width * 15
        printlog('Parameter "frame" is set to %s' % baselineparams['frame'])
    else:
        try:
            baselineparams['frame'] = float(baselineparams['frame'])
        except:
            raise LoggingValueError(
                'Error! %s value for parameter "frame" cannot be converted to float!'
                % baselineparams['frame'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath,
                                   params['h5readpath'] + 'rtrange')
        if smoothmethod != 'none':
            SmoothObject = SmoothFilter(smoothmethod, smoothparams, rtrange)
        else:
            SmoothObject = []

        if baselinemethod != 'none':
            BaselineObject = BaselineFilter(baselinemethod, baselineparams,
                                            rtrange)
        else:
            BaselineObject = []

    elif istrain == 0:
        SmoothObject = SmoothFilter()
        SmoothObject.load_procobj(dbfilepath, params['h5readpath'])
        if SmoothObject.method == '':
            SmoothObject = []
        BaselineObject = BaselineFilter()
        BaselineObject.load_procobj(dbfilepath, params['h5readpath'])
        if BaselineObject.method == '':
            BaselineObject = []

    filternoise_h5(dbfilepath, dataset_names, SmoothObject, BaselineObject,
                   params)

    if istrain == 1:
        #save into hdf5 database file
        if (SmoothObject):
            SmoothObject.export(rtrange)
            SmoothObject.save_procobj(dbfilepath, params['h5writepath'])
        if (BaselineObject):
            BaselineObject.export(rtrange)
            BaselineObject.save_procobj(dbfilepath, params['h5writepath'])

        # guard against smoothmethod == 'none', in which case SmoothObject is an empty list;
        # BaselineFilter is assumed to share save_proc_meta via the common workflow base class
        if (SmoothObject):
            SmoothObject.save_proc_meta(dbfilepath, params['h5writepath'],
                                        params['h5readpath'])
        elif (BaselineObject):
            BaselineObject.save_proc_meta(dbfilepath, params['h5writepath'],
                                          params['h5readpath'])

    return
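
An illustrative call with the default Savitzky-Golay smoothing and top-hat baseline correction; 'auto' scales the smoothing window and the top-hat frame from the stored peak width, as handled above. The file path is a placeholder.

# Illustrative call (the h5 path is hypothetical)
do_noisefilter('/path/to/gcms_data.h5',
               smoothmethod='sqfilter',
               smoothparams={'window': 'auto', 'degree': 3},
               baselinemethod='tophat',
               baselineparams={'frame': 'auto'},
               params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'},
               istrain=1)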
Example no. 11
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA

from proc.procconfig import PeakDetection_options

from proc.utils.cmdline import OptionsHolder
from proc.utils.signalproc import get_threshold
from proc.utils.timing import tic, toc
from proc.utils.msmanager import H5BaseMSIWorkflow as h5Base
from proc.utils.h5utils import h5write_strings
from proc.utils.printlog import printlog, start_log, stop_log

import numpy as np
# NOTE: mh5 (the package's HDF5 I/O helper used below) is assumed to be imported here as in the other examples

filename = 'i:/FTP/GCMS/MSV000080892/Results6/GC_data_for_Jamie.h5'

ncomp = 20
X_3D = mh5.load_dataset(filename, '/spproc2D_peakdetect' + '/X_3D')
X2D = X_3D[:, :, 24]
nmf_1 = NMF(n_components=ncomp, init='nndsvd')

W = nmf_1.fit_transform(X2D)  # integral spectra
H = nmf_1.components_  # quantity integrals (fragmentation patterns/spectra)

# Fraction of the total sum of squares explained by each individual NMF component:
# compare X2D against the rank-1 reconstruction W[:, i] * H[i, :]
denom = np.sum(np.power(X2D, 2))
nmf_comp_variance = np.zeros(ncomp)

for i in range(ncomp):
    nmf_comp_variance[i] = 1 - np.sum(
        np.power(
            X2D - np.dot(np.matrix(W[:, i]).transpose(), np.matrix(H[i, :])),
            2)) / denom
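
A short, self-contained sketch of the same per-component explained-variance calculation on synthetic data, so the formula can be checked without the h5 file; all names here are local to the snippet and the toy matrix is random.

# Self-contained check of the explained-variance formula on a synthetic non-negative matrix
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
X = np.abs(rng.randn(50, 40))                   # toy non-negative data matrix
model = NMF(n_components=5, init='nndsvd', max_iter=500)
W = model.fit_transform(X)
H = model.components_

denom = np.sum(X ** 2)
for i in range(W.shape[1]):
    resid = X - np.outer(W[:, i], H[i, :])       # rank-1 reconstruction from component i
    var_i = 1.0 - np.sum(resid ** 2) / denom     # fraction of total sum of squares explained
    print('component %d explains %.3f of the total sum of squares' % (i, var_i))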
Example no. 12
def get_data(dbfilepath, h5readpath):
    
    """ Extract data from the h5file and output as a dictionary of 'x', 'y', 'ids', and 'colors' for each sample """
    
    data = {'x': [], 'y': [], 'id': [], 'color': []}
    
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return



    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))
    
    try:
        ref2D = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'ref2D']))
        print(ref2D.shape)
    except:
        ref2D = None
    
    
    try:
        histc = mh5.load_dataset(dbfilepath, '/'.join([h5readpath+'_peakdetect', 'clustering_histogram']))
        print(np.min(histc))        
        print(np.max(histc))
    except:
        histc = None


    try:
        histc_threshold = mh5.load_dataset(dbfilepath, '/'.join([h5readpath+'_peakdetect', 'clustering_histogram_threshold']))
        print(np.min(histc_threshold))        
        print(np.max(histc_threshold))
    except:
        histc_threshold = None

    
    tics = np.zeros((sizesp[0], sizesp[2]))
    
    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ": Failed to read in")
            printlog(inst)
            traceback.print_exc()
            
            
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, (dataidx)))
    tics = tics[:, dataidx]
    
    if not (histc is None):
        #histc[histc<=threshold] = 0
        '''
        if 'histc' in data:
            p.line(x = 'hx', y = 'histc', line_width=1, line_color = 'firebrick', source = source);
    
        if 'histc_threshold' in data:
            p.line(x = 'hx', y = 'histc_threshold', line_width = 2, line_color = 'navy', source = source);
            
        if 'ref2D' in data:
            p.line(x = 'refx', y = 'ref2D', line_width = 1, line_color = 'red', source = source)
            
        if 'picked_peaks' in data:
            p.circle(x = 'peak_x', y = 'picked_peaks', color = 'peak_color', size = 3, source = source)
    
        '''
        med_int = np.median(tics, axis=1).flatten()[0:-1] + 1
        med_int = np.sqrt(med_int)
        histc = med_int * histc
        mv = np.max(tics.flatten())
        mv = mv / np.max(histc)
        threshold = median_threshold(histc)
        threshold *= mv
        histc *= mv

        dd = int(len(histc) / 100) - 1

        dx = np.zeros((dd,), dtype=np.float64)
        dy = np.zeros((dd,), dtype=np.float64)

        for i in range(dd):
            dx[i] = np.mean(crt[i*100:i*100+100])
            dy[i] = median_threshold(histc[i*100:i*100+100])

        dy = smooth1D(dx, dy, 10)
        dx[0] = np.min(crt)
        dx[-1] = np.max(crt)
        fit = interp1d(dx, dy, kind='cubic')
        fitted_threshold = fit(crt)
        

    nrows, ncols = tics.shape
    
    
    for i in range(ncols):
        data['x'].append(crt/60)
        data['y'].append(tics[:, i])
        data['id'].append(datasets[i])

    data['color'] = colorgenerator(ncols)
    
    
    return data
Example no. 13
    def bin_h5(self, dbfilepath, datasets):
        """
        Performs bining of GC-MS data to generate data intensity matrix [number of mz features x number of scans]
        common across all samples
        """

        with h5py.File(dbfilepath, 'a') as h5file:

            printlog(
                "\nPreparing for intra-sample m/z correction %s datasets from %s...\n"
                % (len(datasets), dbfilepath))
            dataindex = 0
            i = 0

            peak_width = 0.0
            dataset_count = 0

            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'cmz',
                             data=self.__binids,
                             compression_opts=5)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'crt',
                             data=self.__rtvals,
                             compression_opts=5)
            peak_finder = PeakFinder(dbfilepath, self.params['h5writepath'],
                                     '')

            for datasetid in datasets:
                dataindex = dataindex + 1
                try:
                    mzraw = mh5.load_dataset(h5file,
                                             '/raw' + datasetid + '/mz')
                    spraw = mh5.load_dataset(h5file,
                                             '/raw' + datasetid + '/sp')
                    scanidx = mh5.load_dataset(h5file,
                                               '/raw' + datasetid + '/scan')
                    rtraw = mh5.load_dataset(h5file,
                                             '/raw' + datasetid + '/time')
                    sp2D, cmz, crt = self.bin_sp(mzraw, spraw, scanidx, rtraw)
                    mh5.save_dataset(h5file,
                                     self.params['h5writepath'][:-1] +
                                     datasetid + '/sp',
                                     data=sp2D,
                                     compression_opts=5)
                    dataset_count += 1
                    peaks, npeaks = peak_finder.findpeaks_sp(
                        np.sum(sp2D, axis=1).flatten(), gap=5)

                    if npeaks > 10:

                        threshold = median_threshold(peaks[0, :])

                        mask = peaks[0, :] >= threshold

                        ipeak_widths = peaks[10, mask]

                        if len(ipeak_widths) > 1:

                            sorted_peakwidths = ipeak_widths[np.argsort(
                                peaks[0, mask])]

                            slice_count = int(sorted_peakwidths.shape[0] / 10)
                            #print(slice_count)
                            if slice_count > 0:
                                quant = np.min(
                                    sorted_peakwidths[0:slice_count])
                            else:
                                quant = 0.0
                            #print(quant)
                            med = np.median(ipeak_widths) / 3.0
                            #print(med)

                            peak_width += max(med, quant)

                            i = i + 1
                        else:
                            printlog(
                                'No peaks passed threshold in %s. Skipping mean peak width estimation...'
                                % datasetid)

                    else:
                        printlog(
                            'Less than 10 peaks detected in %s. Skipping mean peak width estimation...'
                            % datasetid)

                    printlog(
                        '%s. %s: Successfully corrected and deposited -> %s%s'
                        % (dataindex, datasetid, os.path.basename(dbfilepath),
                           self.params['h5writepath']))

                    target_gname = self.params['h5writepath'][:-1] + datasetid
                    source_gname = '/raw' + datasetid

                    wgroup = h5file[target_gname]
                    sgroup = h5file[source_gname]

                    wgroup.attrs['is_raw'] = False
                    wgroup.attrs['is_OK'] = True
                    wgroup.attrs['is_processed'] = True
                    wgroup.attrs['is_continuous'] = True
                    wgroup.attrs['is_sample_dataset'] = True
                    wgroup.attrs['parent'] = np.string_(source_gname)
                    mh5.copy_meta_over(sgroup, wgroup)

                except Exception as inst:
                    printlog('%s. %s: Failed to be corrected' %
                             (dataindex, datasetid))
                    printlog(inst)
                    traceback.print_exc()

            peak_width = peak_width / i

            sizesp = np.array([len(crt), len(cmz), dataset_count])

            printlog('Estimated min rt peak width: %s sec or %.2f min' %
                     (peak_width, peak_width / 60.0))
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'peak_width',
                             data=peak_width)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'mzrange',
                             data=self.mzrange)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'rtrange',
                             data=self.rtrange)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'sizesp',
                             data=sizesp)