Example #1
def filternoise_h5(dbfilepath, datasets, SmoothObject, BaselineObject, params):

    if (SmoothObject) and (BaselineObject):
        printlog('\n' + "Preparing for smoothing and baseline correction " +
                 os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully smoothed, baseline corrected and deposited ->'
    elif (SmoothObject):
        printlog('\n' + "Preparing for smoothing " +
                 os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully smoothed and deposited ->'
    elif BaselineObject:
        printlog('\n' + "Preparing for baseline correction " +
                 os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully baseline corrected and deposited ->'
    else:
        return

    with h5py.File(dbfilepath, 'a') as h5file:

        i = 0
        dataindex = 0
        for datasetid in datasets:
            dataindex = dataindex + 1
            try:
                sp2D = mh5.load_dataset(
                    h5file, params['h5readpath'][:-1] + datasetid + '/sp')
                if (SmoothObject):
                    sp2D = SmoothObject.fit(sp2D)
                if (BaselineObject):
                    sp2D = BaselineObject.fit(sp2D)

                mh5.save_dataset(h5file, params['h5writepath'][:-1] + datasetid + '/sp',
                                 data=sp2D, compression_opts=5)
                printlog('%s. %s: %s %s%s' % (dataindex, datasetid, printstring,
                                              os.path.basename(dbfilepath), params['h5writepath']))
                i = i + 1

                target_gname = params['h5writepath'][:-1] + datasetid
                source_gname = params['h5readpath'][:-1] + datasetid

                wgroup = h5file[target_gname]
                sgroup = h5file[source_gname]

                wgroup.attrs['is_raw'] = False
                wgroup.attrs['is_OK'] = True
                wgroup.attrs['is_processed'] = True
                wgroup.attrs['is_continuous'] = True
                wgroup.attrs['is_sample_dataset'] = True
                wgroup.attrs['parent'] = np.string_(source_gname)
                mh5.copy_meta_over(sgroup, wgroup)

            except Exception as inst:
                printlog('%s. %s: %s' % (dataindex, datasetid, 'Failed'))
                printlog(inst)
                traceback.print_exc()
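
# Illustrative sketch (standalone, not part of the project code above): the provenance-flag
# pattern that filternoise_h5 applies to every processed dataset group, shown on a tiny
# self-contained HDF5 file. File and group names are made up; the project-specific helper
# mh5.copy_meta_over is omitted.
import h5py
import numpy as np

with h5py.File('provenance_demo.h5', 'w') as h5file:
    h5file.create_dataset('/sp2D/sample_01/sp', data=np.random.rand(20, 8))
    h5file.create_dataset('/spproc2D/sample_01/sp', data=np.random.rand(20, 8))

    sgroup = h5file['/sp2D/sample_01']       # source group (read path)
    wgroup = h5file['/spproc2D/sample_01']   # target group (write path)

    # mark the target as a processed, non-raw copy and record its parent
    wgroup.attrs['is_raw'] = False
    wgroup.attrs['is_OK'] = True
    wgroup.attrs['is_processed'] = True
    wgroup.attrs['is_continuous'] = True
    wgroup.attrs['is_sample_dataset'] = True
    wgroup.attrs['parent'] = np.string_('/sp2D/sample_01')
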
Example #2
    def save_proc_meta(self, dbfilepath, h5writepath, h5readpath):
        if h5writepath != h5readpath:
            mzrange = mh5.load_dataset(dbfilepath, h5readpath + 'mzrange')
            rtrange = mh5.load_dataset(dbfilepath, h5readpath + 'rtrange')
            cmz     = mh5.load_dataset(dbfilepath, h5readpath + 'cmz')
            crt     = mh5.load_dataset(dbfilepath, h5readpath + 'crt')
            sizesp  = mh5.load_dataset(dbfilepath, h5readpath + 'sizesp')

            mh5.save_dataset(dbfilepath, h5writepath + 'mzrange', data=mzrange)
            mh5.save_dataset(dbfilepath, h5writepath + 'rtrange', data=rtrange)
            mh5.save_dataset(dbfilepath, h5writepath + 'cmz', data=cmz, compression_opts=5)
            mh5.save_dataset(dbfilepath, h5writepath + 'crt', data=crt, compression_opts=5)
            mh5.save_dataset(dbfilepath, h5writepath + 'sizesp', data=sizesp)
    def aling_h5(self, dbfilepath, datasets, h5readpath, h5writepath):

        #if not self.ref2D:

        printlog("\nPerforming internal sample retention time profile alignment across %s datasets from \n%s...\n" % (len(datasets),dbfilepath))
        dataindex = 0 
        with h5py.File(dbfilepath, 'a') as h5file:   
        
            #mh5.save_dataset(h5file, h5writepath + '/ref2D', data = self.ref2D, compression_opts = 5)
            
            for datasetid in datasets:
                dataindex = dataindex + 1                

                try:
                    sp2D = mh5.load_dataset(h5file, h5readpath[:-1] + datasetid + '/sp')
                    
                    nrt, nmz = sp2D.shape
                    
                    ref2D = np.mean(sp2D, axis=1)
                    
                    mh5.save_dataset(h5file, h5writepath + datasetid.lstrip('/') + '/ref2D', data = ref2D, compression_opts = 5)
                    
                    for i in range(nmz):
                        alprof = self.align(sp2D[:, i], ref2D[:])
                        sp2D[:, i] = alprof
                        
                    mh5.save_dataset(h5file, h5writepath[:-1] + datasetid+ '/sp', data = sp2D, compression_opts = 5)
                    
                    printlog('%s. %s: Successfully aligned and deposited -> %s%s' %(dataindex, datasetid, os.path.basename(dbfilepath), h5writepath))
                    
                    target_gname = h5writepath[:-1] + datasetid
                    source_gname = h5readpath[:-1] + datasetid

                    wgroup = h5file[target_gname]
                    sgroup = h5file[source_gname]

                    wgroup.attrs['is_raw'] = False
                    wgroup.attrs['is_OK'] = True
                    wgroup.attrs['is_processed'] = True
                    wgroup.attrs['is_continuous'] = True
                    wgroup.attrs['is_sample_dataset'] = True
                    wgroup.attrs['parent'] = np.string_(source_gname)
                    mh5.copy_meta_over(sgroup, wgroup)
                    
                    
                except Exception as inst:
                    printlog('%s. %s: Failed to be deposited' %(dataindex, datasetid))  
                    printlog(inst)
                    traceback.print_exc()
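
# Illustrative sketch (standalone, not part of the class above): aling_h5 aligns every
# m/z channel of a sample against the mean retention-time profile via self.align, whose
# RSPA implementation is not shown in this listing. The toy version below uses a plain
# cross-correlation shift instead, only to illustrate the per-channel loop structure.
import numpy as np

def xcorr_shift_align(profile, reference, max_shift=10):
    """Shift profile by the lag (within +/- max_shift scans) that best matches reference."""
    lags = np.arange(-max_shift, max_shift + 1)
    scores = [np.dot(np.roll(profile, lag), reference) for lag in lags]
    return np.roll(profile, lags[int(np.argmax(scores))])

rt = np.arange(200)
peak = np.exp(-0.5 * ((rt - 100) / 5.0) ** 2)
sp2D = np.column_stack([np.roll(peak, s) for s in (-4, 0, 3)])  # three shifted m/z channels

ref2D = np.mean(sp2D, axis=1)            # mean retention-time profile, as in aling_h5
for i in range(sp2D.shape[1]):
    sp2D[:, i] = xcorr_shift_align(sp2D[:, i], ref2D)
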
def do_profile_alignment(dbfilepath, method='rspa', params = {'recursion':1, 
                                                              'minsegwidth':100, 
                                                              'maxpeakshift':10,
                                                              'reference':'mean',
                                                              'h5readpath': '/proc',
                                                              'h5writepath': '/proc'},
                                                                istrain=1):
    """
    Performs advanced adjustment for chromatographic peak position variations at full profile resolution
    using recursive segment-wise peak alignment strategy
    
    Args:
    
        dbfilepath: The database file path
                    
        method: The choice of peak alignment method. Default value: 'rspa', i.e. Recursive segment-wise peak alignment.  
        
        params: The dictionary of peak alignment parameters
    """     
    
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath']  = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath,dataset_names=[],pathinh5 = params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width',\
                                 data=peak_width)

    if str(params['minsegwidth']).lower() == 'auto':
        params['minsegwidth'] = peak_width * 10.0
        printlog('Parameter "minsegwidth" is set to %s' % params['minsegwidth'])
    else:
        try:
            params['minsegwidth'] = float(params['minsegwidth'])
        except:
            raise LoggingValueError('Error! %s value for parameter "minsegwidth" cannot be converted to float!'%params['minsegwidth'])
            

    
    if str(params['maxpeakshift']).lower() == 'auto':
        params['maxpeakshift'] = peak_width * 5
        printlog('Parameter "maxpeakshift" is set to %s' % params['maxpeakshift'])
    else:
        try:
            params['maxpeakshift'] = float(params['maxpeakshift'])
        except:
            raise LoggingValueError('Error! %s value for parameter "maxpeakshift" cannot be converted to float!'%params['maxpeakshift'])
        

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if method == 'rspa':
            rtAlObj = RSPA(method, params, rtrange)

    elif istrain == 0:
        rtAlObj = RSPA()
        rtAlObj.load_procobj(dbfilepath, params['h5readpath'])

    rtAlObj.aling_h5(dbfilepath, dataset_names, params['h5readpath'], params['h5writepath'])
    
    if istrain == 1:
        # save into hdf5 database file
        rtAlObj.export()
        rtAlObj.save_procobj(dbfilepath, params['h5writepath'])
        rtAlObj.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])
                    
    return
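
# Illustrative sketch (standalone): the 'auto' parameter convention used by
# do_profile_alignment above, where 'minsegwidth' and 'maxpeakshift' either derive from
# the estimated peak width (factors 10 and 5 in the code above) or are parsed as floats.
# The helper name below is made up for illustration.
def resolve_auto(value, peak_width, multiplier):
    """Return peak_width * multiplier for 'auto', otherwise the value cast to float."""
    if str(value).lower() == 'auto':
        return peak_width * multiplier
    try:
        return float(value)
    except ValueError:
        raise ValueError('%s cannot be converted to float' % value)

print(resolve_auto('auto', 1.5, 10.0))  # minsegwidth from peak width -> 15.0
print(resolve_auto('120', 1.5, 5.0))    # explicit user value          -> 120.0
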
Example #5
def do_noisefilter(dbfilepath,
                   smoothmethod='sqfilter',
                   smoothparams={
                       'window': 5,
                       'degree': 3
                   },
                   baselinemethod='tophat',
                   baselineparams={'frame': 90},
                   params={
                       'h5readpath': '/sp2D',
                       'h5writepath': '/spproc2D'
                   },
                   istrain=1):
    """
    Performs adjustment for high frequency noise and lower frequency baseline distortions 
    due to a variety of instrumental and experimental reasons
    
    Args:
    
        dbfilepath: a user-specified path to the h5 database file
                    
        smoothmethod: The type of noise filtering method. Default value: 'sqfilter', i.e. the Savitzky-Golay filter.  
        
        smoothparams: The dictionary of parameter arguments for noise filtering method 
        
        baselinemethod: The type of a baseline correction method. Default value: 'tophat', i.e. the top-hat morphological filter.
        
        baselineparams: The dictionary of parameter arguments for baseline correction method          
    """

    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath,
                                          dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath,
                                  params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width',\
                                 data=peak_width)

    if str(smoothparams['window']).lower() == 'auto':
        smoothparams['window'] = peak_width * 0.5
        printlog('Parameter "window" is set to %s' % smoothparams['window'])
    else:
        try:
            smoothparams['window'] = float(smoothparams['window'])
        except:
            raise LoggingValueError(
                'Error! %s value for parameter "window" cannot be converted to float!'
                % smoothparams['window'])

    if str(baselineparams['frame']).lower() == 'auto':
        baselineparams['frame'] = peak_width * 15
        printlog('Parameter "frame" is set to %s' % baselineparams['frame'])
    else:
        try:
            baselineparams['frame'] = float(baselineparams['frame'])
        except:
            raise LoggingValueError(
                'Error! %s value for parameter "frame" cannot be converted to float!'
                % baselineparams['frame'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath,
                                   params['h5readpath'] + 'rtrange')
        if smoothmethod != 'none':
            SmoothObject = SmoothFilter(smoothmethod, smoothparams, rtrange)
        else:
            SmoothObject = []

        if baselinemethod != 'none':
            BaselineObject = BaselineFilter(baselinemethod, baselineparams,
                                            rtrange)
        else:
            BaselineObject = []

    elif istrain == 0:
        SmoothObject = SmoothFilter()
        SmoothObject.load_procobj(dbfilepath, params['h5readpath'])
        if SmoothObject.method == '':
            SmoothObject = []
        BaselineObject = BaselineFilter()
        BaselineObject.load_procobj(dbfilepath, params['h5readpath'])
        if BaselineObject.method == '':
            BaselineObject = []

    filternoise_h5(dbfilepath, dataset_names, SmoothObject, BaselineObject,
                   params)

    if istrain == 1:
        #save into hdf5 database file
        if (SmoothObject):
            SmoothObject.export(rtrange)
            SmoothObject.save_procobj(dbfilepath, params['h5writepath'])
        if (BaselineObject):
            BaselineObject.export(rtrange)
            BaselineObject.save_procobj(dbfilepath, params['h5writepath'])

        SmoothObject.save_proc_meta(dbfilepath, params['h5writepath'],
                                    params['h5readpath'])

    return
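
# Illustrative sketch (standalone, not the project's SmoothFilter/BaselineFilter classes):
# the two operations named by the defaults above -- Savitzky-Golay smoothing ('sqfilter',
# window 5, degree 3) followed by top-hat morphological baseline removal ('tophat',
# frame 90) -- applied to one synthetic chromatogram with standard SciPy calls.
import numpy as np
from scipy.signal import savgol_filter
from scipy.ndimage import white_tophat

rt = np.linspace(0, 600, 3000)                          # retention-time axis, seconds
signal = (np.exp(-0.5 * ((rt - 200.0) / 3.0) ** 2)      # a chromatographic peak
          + 0.002 * rt                                  # slow baseline drift
          + np.random.normal(0.0, 0.02, rt.size))       # high-frequency noise

smoothed = savgol_filter(signal, window_length=5, polyorder=3)
corrected = white_tophat(smoothed, size=90)

print(smoothed.shape, corrected.shape)
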
Example #6
    def mzxml_reader(self, filelist):
        """
        Performs reading chromatography-mass spectrometry data from mzXML files

        Args:

            filelist: the list of files

        """
        import mzxml

        hf = h5py.File(os.path.join(self.dbpath, self.dbname), 'w')
        minmass = np.inf
        maxmass = 0.
        minrt = np.inf
        maxrt = 0.
        resrt = -1.

        print(" \nReading mzXML files from '%s'...\n" % (self.dbpath))

        #print(filelist)

        # loop through the files
        fileindex = 0
        for filepath in filelist:
            fileindex = fileindex + 1

            try:
                mzXML_scans = mzxml.load_mzxml_file(filepath)
                #raise Exception('Not available yet due to issues.')
            except Exception as inst:
                printlog(inst)
                traceback.print_exc()
                print("%s: Cannot open file" % filepath)

                continue

            try:
                mass_values = np.array([])
                intensity_values = np.array([])
                time_list_list = []
                for scan in mzXML_scans:
                    time_list_list.append(scan.retention_time)
                    scan_masses_as_list, scan_intensities_as_list = zip(
                        *scan.peaks)
                    mass_values = np.concatenate(
                        (mass_values, np.asarray(scan_masses_as_list)))
                    intensity_values = np.concatenate(
                        (intensity_values,
                         np.asarray(scan_intensities_as_list)))

                time_list = np.asarray(time_list_list)
                minmass = np.min([minmass, np.min(mass_values)])
                maxmass = np.max([maxmass, np.max(mass_values)])

                if not len(mass_values) == len(intensity_values):
                    print(
                        "The length of mass_list is not equal to the length of intensity_list. Data not deposited!"
                    )
                    continue

                if np.median(np.diff(mass_values)) < 0:
                    # m/z values arranged in descending order
                    scanidx = np.diff(mass_values) >= 0
                    descend = 1
                else:
                    # m/z values arranged in ascending order
                    scanidx = np.diff(mass_values) <= 0
                    descend = 0

                # calculate the number of scans
                scanidx = np.append(scanidx, True)
                nscans = np.sum(scanidx)
                scanidx = np.where(scanidx == True)
                scanidx = scanidx[0][np.arange(0, nscans)]

                # pre-allocate arrays
                scan_start = np.array([]).astype(int)
                scan_end = np.array([]).astype(int)

                # arrange data for hdf5 database file storage
                istart = 0
                for iend in scanidx:
                    imass_values = mass_values[np.arange(istart, iend + 1)]
                    iintensity_values = intensity_values[np.arange(
                        istart, iend + 1)]
                    if len(imass_values) > 0:
                        if descend == 1:
                            mass_values[np.arange(istart, iend +
                                                  1)] = imass_values[::-1]
                            intensity_values[np.arange(
                                istart, iend + 1)] = iintensity_values[::-1]
                        scan_start = np.append(scan_start, istart)
                        scan_end = np.append(scan_end, iend)
                    istart = iend + 1

                # arrange data for hdf5 database file storage
                scan_indlist = np.vstack((scan_start, scan_end))
                if np.median(np.diff(time_list)) < 0:
                    time_list = time_list[::-1]
                minrt = np.min([minrt, np.min(time_list)])
                maxrt = np.max([maxrt, np.max(time_list)])
                if resrt == -1:
                    resrt = np.median(np.diff(time_list))
                else:
                    resrt = 0.5 * (np.median(np.diff(time_list)) + resrt)

                # sanity check
                if not len(time_list) == len(scan_start):
                    print(
                        "The number of time points (%d) does not equal the number of scans (%d). Data not deposited!"
                        % (len(time_list), len(scan_start)))
                    continue
                else:
                    # deposit data into hdf5 database file
                    dpath = 'raw/' + os.path.splitext(
                        os.path.basename(filepath))[0]
                    try:
                        ginfo = hf.create_group(dpath)
                        try:
                            m5db.save_dataset(ginfo,
                                              'mz',
                                              data=mass_values,
                                              compression_opts=5)
                            m5db.save_dataset(ginfo,
                                              'sp',
                                              data=intensity_values,
                                              compression_opts=5)
                            m5db.save_dataset(ginfo,
                                              'time',
                                              data=time_list * self.time_mult)
                            m5db.save_dataset(ginfo, 'scan', data=scan_indlist)

                            # chunks=(nRows, nCols, 1))
                            print('%s. %s: Successfully deposited -> %s' %
                                  (fileindex, os.path.basename(filepath),
                                   self.dbname))

                        except:
                            print('%s. %s: Failed to deposit' %
                                  (fileindex, os.path.basename(filepath)))
                    except:
                        print(
                            '%s. %s: All files must have unique names: Failed to create a dataset in -> %s'
                            % (fileindex, os.path.basename(filepath),
                               self.dbname))
            except:
                print('%s. %s: Failed to read in data' %
                      (fileindex, os.path.basename(filepath)))
                raise
        m5db.save_dataset(hf,
                          'raw/cmzrange',
                          data=[np.floor(minmass),
                                np.ceil(maxmass)])
        m5db.save_dataset(hf,
                          'raw/crtrange',
                          data=[
                              np.floor(minrt * self.time_mult),
                              resrt * self.time_mult,
                              np.ceil(maxrt * self.time_mult)
                          ])

        printlog('Minimal mz: %s' % np.floor(minmass))
        printlog('Maximal mz: %s' % np.ceil(maxmass))
        printlog('Minimal rt (sec): %s' % np.floor(minrt * self.time_mult))
        printlog('Maximal rt (sec): %s' % np.ceil(maxrt * self.time_mult))
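
# Illustrative sketch (standalone): how scan boundaries are recovered from the single
# concatenated m/z vector in mzxml_reader above -- a scan ends wherever the next m/z
# value stops increasing (the np.diff test), plus at the very last point. The toy
# mass_values below stand in for three concatenated ascending scans.
import numpy as np

mass_values = np.array([50.0, 51.2, 53.4,        # scan 1
                        49.9, 52.0, 55.1, 58.3,  # scan 2
                        50.5, 54.0])             # scan 3

scanidx = np.diff(mass_values) <= 0      # True where the next value does not increase
scanidx = np.append(scanidx, True)       # the last point always closes a scan
scan_end = np.where(scanidx)[0]
scan_start = np.concatenate(([0], scan_end[:-1] + 1))

print(scan_start)   # [0 3 7]
print(scan_end)     # [2 6 8]
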
Example #7
    def mzml_reader(self, filelist):
        """
        Performs reading chromatography-mass spectrometry data from mzML files

        Args:

            filelist: the list of files

        """
        import pymzml as pyml
        hf = h5py.File(os.path.join(self.dbpath, self.dbname), 'w')
        minmass = np.inf
        maxmass = 0.
        minrt = np.inf
        maxrt = 0.
        resrt = -1.

        printlog(" \nReading mzML files from '%s'...\n" % (self.dbpath))
        # loop through the files
        fileindex = 0
        for filepath in filelist:
            fileindex = fileindex + 1
            try:
                dataset = pyml.run.Reader(filepath)
            except Exception as inst:
                printlog("%s: Cannot open file" % filepath)
                printlog(inst)
                traceback.print_exc()
                continue

            try:
                # pre-allocate arrays (faster than appending)
                n_scans = 0
                n_values = 0
                for sp in dataset:
                    if 'MS:1000511' in sp:  # (code 'MS:1000511' for ms level)
                        if sp['MS:1000511'] == 1:  # ms level 1 only
                            if 'MS:1000016' in sp:  # (code 'MS:1000016' for retention time value)
                                X = np.array(sp.centroidedPeaks).astype(float)
                                if len(X) > 0 and len(X.shape) == 2:
                                    healthcheck = np.logical_and(
                                        np.all(np.isfinite(X)),
                                        np.all(np.isreal(X)))
                                    if healthcheck:
                                        n_values = n_values + len(X)
                                        n_scans = n_scans + 1
                                    else:
                                        printlog(
                                            'Slice with numerical errors. Skipping...'
                                        )
                                else:
                                    printlog('Empty slice. Skipping...')

                scan_start = np.zeros(n_scans).astype(int)
                scan_end = np.zeros(n_scans).astype(int)
                mass_values = np.zeros(n_values).astype(float)
                intensity_values = np.zeros(n_values).astype(float)
                time_list = np.zeros(n_scans).astype(float)

                istart = 0
                iscan = -1
                dataset = pyml.run.Reader(filepath)
                for sp in dataset:
                    if 'MS:1000511' in sp:  # (code 'MS:1000511' for ms level)
                        if sp['MS:1000511'] == 1:  # ms level 1 only
                            if 'MS:1000016' in sp:  # (code 'MS:1000016' for retention time value)
                                X = np.array(sp.centroidedPeaks).astype(float)
                                if len(X) > 0 and len(X.shape) == 2:
                                    healthcheck = np.logical_and(
                                        np.all(np.isfinite(X)),
                                        np.all(np.isreal(X)))
                                    if healthcheck:

                                        imass_values = X[:, 0]
                                        iintensity_values = X[:, 1]

                                        iscan = iscan + 1
                                        # normalise each scan to ascending m/z order
                                        if np.median(np.diff(imass_values)) < 0:
                                            imass_values = imass_values[::-1]
                                            iintensity_values = iintensity_values[::-1]

                                        scan_start[iscan] = istart
                                        iend = istart + len(imass_values) - 1
                                        scan_end[iscan] = iend

                                        mass_values[istart:iend + 1] = imass_values
                                        intensity_values[istart:iend + 1] = iintensity_values
                                        istart = iend + 1

                                        time_list[iscan] = sp['MS:1000016']

                                    else:
                                        printlog(
                                            'Slice with numerical errors. Skipping...'
                                        )
                                else:
                                    printlog('Empty slice. Skipping...')

                # arrange data for hdf5 database file storage
                scan_indlist = np.vstack((scan_start, scan_end))

                # in case data are in descending order
                if np.median(np.diff(time_list)) < 0:
                    time_list = time_list[::-1]

                # update retention time range and resolution
                minrt = np.min([minrt, np.min(time_list)])
                maxrt = np.max([maxrt, np.max(time_list)])
                if resrt == -1:
                    resrt = np.median(np.diff(time_list))
                else:
                    resrt = 0.5 * (np.median(np.diff(time_list)) + resrt)

                # update mass range
                minmass = np.min([minmass, np.min(mass_values)])
                maxmass = np.max([maxmass, np.max(mass_values)])

                # sanity check
                if not len(time_list) == len(scan_start):
                    printlog(
                        "The number of time points (%d) does not equal the number of scans (%d). Data not deposited!"
                        % (len(time_list), len(scan_start)))
                    continue
                else:
                    # deposit data into hdf5 database file
                    dpath = 'raw/' + os.path.splitext(
                        os.path.basename(filepath))[0]
                    try:
                        ginfo = hf.create_group(dpath)
                        try:
                            m5db.save_dataset(ginfo,
                                              'mz',
                                              data=mass_values,
                                              compression_opts=5)
                            m5db.save_dataset(ginfo,
                                              'sp',
                                              data=intensity_values,
                                              compression_opts=5)
                            m5db.save_dataset(ginfo,
                                              'time',
                                              data=(time_list *
                                                    self.time_mult))
                            m5db.save_dataset(ginfo, 'scan', data=scan_indlist)

                            # chunks=(nRows, nCols, 1))
                            printlog('%s. %s: Successfully deposited -> %s' %
                                     (fileindex, os.path.basename(filepath),
                                      self.dbname))
                            ginfo.attrs['is_raw'] = True
                            ginfo.attrs['is_OK'] = True
                            ginfo.attrs['is_processed'] = False
                            ginfo.attrs['is_continuous'] = True
                            ginfo.attrs['is_sample_dataset'] = True
                            #TODO: properly treat test and training data
                            #ginfo.attrs['is_training'] = True;

                        except Exception as inst:
                            printlog('%s. %s: Failed to deposit' %
                                     (fileindex, os.path.basename(filepath)))
                            printlog(inst)
                            traceback.print_exc()

                    except Exception as inst:
                        printlog(
                            '%s. %s: All files must have unique names: Failed to create a dataset in -> %s'
                            % (fileindex, os.path.basename(filepath),
                               self.dbname))
                        printlog(inst)
                        traceback.print_exc()

            except Exception as inst:
                printlog('%s. %s: Failed to read in data' %
                         (fileindex, os.path.basename(filepath)))
                printlog(inst)
                traceback.print_exc()

        m5db.save_dataset(hf,
                          'raw/cmzrange',
                          data=[np.floor(minmass),
                                np.ceil(maxmass)])
        m5db.save_dataset(hf,
                          'raw/crtrange',
                          data=[
                              np.floor(minrt * self.time_mult),
                              resrt * self.time_mult,
                              np.ceil(maxrt * self.time_mult)
                          ])

        printlog('Minimal mz: %s' % np.floor(minmass))
        printlog('Maximal mz: %s' % np.ceil(maxmass))
        printlog('Minimal rt (sec): %s' % np.floor(minrt * self.time_mult))
        printlog('Maximal rt (sec): %s' % np.ceil(maxrt * self.time_mult))
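
# Illustrative sketch (standalone): the two-pass pattern used by mzml_reader above.
# The first pass only counts scans and peak values so that the output arrays can be
# pre-allocated; the second pass fills them in place, avoiding repeated np.concatenate
# calls. make_scans() is a made-up stand-in for iterating a pymzml Reader twice.
import numpy as np

def make_scans():
    rng = np.random.default_rng(0)
    for rt in (1.0, 2.0, 3.0):
        yield rt, rng.random((int(rng.integers(3, 6)), 2))   # columns: m/z, intensity

# pass 1: count
n_scans = n_values = 0
for _, peaks in make_scans():
    n_scans += 1
    n_values += len(peaks)

# pass 2: fill pre-allocated arrays
mass_values = np.zeros(n_values)
intensity_values = np.zeros(n_values)
time_list = np.zeros(n_scans)
scan_start = np.zeros(n_scans, dtype=int)
scan_end = np.zeros(n_scans, dtype=int)

istart = 0
for iscan, (rt, peaks) in enumerate(make_scans()):
    iend = istart + len(peaks) - 1
    scan_start[iscan], scan_end[iscan] = istart, iend
    mass_values[istart:iend + 1] = peaks[:, 0]
    intensity_values[istart:iend + 1] = peaks[:, 1]
    time_list[iscan] = rt
    istart = iend + 1

print(n_scans, n_values, scan_start, scan_end)
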
Example #8
    def netcdf_reader(self, filelist):
        """
        Performs reading chromatography-mass spectrometry data from NETCDF files

        Args:

            filelist: the list of files

        """
        from netCDF4 import Dataset
        hf = h5py.File(os.path.join(self.dbpath, self.dbname), 'w')
        minmass = np.inf
        maxmass = 0.
        minrt = np.inf
        maxrt = 0.
        resrt = -1.

        printlog(" \nReading netCDF files from '%s'...\n" % (self.dbpath))
        # loop through the files
        fileindex = 0
        for filepath in filelist:
            fileindex = fileindex + 1

            try:
                file = Dataset(filepath)
            except Exception as inst:
                printlog("%s: Cannot open file" % filepath)
                printlog(inst)
                traceback.print_exc()
                continue

            try:
                mass_values = np.array(file.variables[self.__mass_string][:])
                intensity_values = np.array(
                    file.variables[self.__intensity_string][:])
                time_values = np.array(file.variables[self.__time_string][:])
                scan_values = np.array(file.variables[self.__scan_string][:])

                minmass = np.min([minmass, np.min(mass_values)])
                maxmass = np.max([maxmass, np.max(mass_values)])

                if not len(mass_values) == len(intensity_values):
                    printlog(
                        "The length of mass_list is not equal to the length of intensity_list. Data not deposited!"
                    )
                    continue

                if not len(time_values) == len(scan_values):
                    printlog(
                        "The length of scan_acquisition_time is not equal to the length of scan_index. Data not deposited!"
                    )
                    continue

                #removing empty scans
                dd = np.diff(scan_values) != 0
                dd = np.append(dd, True)
                ddi = np.arange(scan_values.shape[0], dtype=np.int64)[dd]
                time_values = time_values[ddi]
                scan_values = scan_values[ddi]

                #if np.median(np.diff(mass_values))<0:

                if np.median(np.diff(time_values)) < 0:
                    printlog(
                        'Decreasing scan time values detected! Data not deposited!'
                    )
                    continue
                minrt = np.min([minrt, np.min(time_values)])
                maxrt = np.max([maxrt, np.max(time_values)])

                if resrt == -1:
                    resrt = np.median(np.diff(time_values))
                else:
                    resrt = 0.5 * (np.median(np.diff(time_values)) + resrt)

                scan_end_values = np.append(scan_values[1:] - 1,
                                            mass_values.shape[0] - 1)

                scan_indcs = np.vstack([scan_values, scan_end_values])

                dpath = 'raw/' + os.path.splitext(
                    os.path.basename(filepath))[0]
                try:
                    ginfo = hf.create_group(dpath)
                    try:
                        m5db.save_dataset(ginfo,
                                          'mz',
                                          data=mass_values,
                                          compression_opts=5)
                        m5db.save_dataset(ginfo,
                                          'sp',
                                          data=intensity_values,
                                          compression_opts=5)
                        m5db.save_dataset(ginfo,
                                          'time',
                                          data=(time_values * self.time_mult))
                        m5db.save_dataset(ginfo, 'scan', data=scan_indcs)

                        # chunks=(nRows, nCols, 1))
                        printlog('%s. %s: Successfully deposited -> %s' %
                                 (fileindex, os.path.basename(filepath),
                                  self.dbname))
                        ginfo.attrs['is_raw'] = True
                        ginfo.attrs['is_OK'] = True
                        ginfo.attrs['is_processed'] = False
                        ginfo.attrs['is_continuous'] = True
                        ginfo.attrs['is_sample_dataset'] = True
                        #TODO: properly treat test and training data
                        #ginfo.attrs['is_training'] = True;

                    except Exception as inst:
                        printlog('%s. %s: Failed to deposit' %
                                 (fileindex, os.path.basename(filepath)))
                        printlog(inst)
                        traceback.print_exc()

                except Exception as inst:
                    printlog(
                        '%s. %s: All files must have unique names: Failed to create a dataset in -> %s'
                        % (fileindex, os.path.basename(filepath), self.dbname))
                    printlog(inst)
                    traceback.print_exc()
                '''
                

                # calculate the number of scans
                scanidx = np.append(scanidx,True)
                nscans  = np.sum(scanidx)
                scanidx = np.where(scanidx==True)
                scanidx = scanidx[0][np.arange(0,nscans)]

                # pre-allocate arrays
                scan_start = np.array([]).astype(int)
                scan_end = np.array([]).astype(int)

                # arrange data for hdf5 database file storage
                istart  = 0
                for iend in scanidx:
                    imass_values = mass_values[np.arange(istart,iend+1)]
                    iintensity_values = intensity_values[np.arange(istart,iend+1)]
                    if len(imass_values)>0:
                        if descend==1:
                            mass_values[np.arange(istart,iend+1)] = imass_values[::-1]
                            intensity_values[np.arange(istart,iend+1)] = iintensity_values[::-1]
                        scan_start = np.append(scan_start,istart)
                        scan_end   = np.append(scan_end,iend)
                    istart = iend+1

                # arrange data for hdf5 database file storage
                scan_indlist = np.vstack((scan_start,scan_end))
                time_list    = np.array(file.variables[self.__time_string][:])
                if np.median(np.diff(time_list))<0:
                    time_list    = time_list[::-1]

                # sanity check
                if not len(time_list) == len(scan_start):
                    printlog("The number of time points (%d) does not equal the number of scans (%d). Data not deposited!"%(len(time_list), len(scan_start)))
                    continue
                else:
                    # deposit data into hdf5 database file
                    dpath   = 'raw/' + os.path.splitext(os.path.basename(filepath))[0]
                    try:
                        ginfo    = hf.create_group(dpath)
                        try:
                            m5db.save_dataset(ginfo,'mz', data = mass_values,compression_opts = 5)
                            m5db.save_dataset(ginfo,'sp', data = intensity_values, compression_opts = 5)
                            m5db.save_dataset(ginfo,'time',data = (time_list * self.time_mult) )
                            m5db.save_dataset(ginfo,'scan',data = scan_indlist)

                           # chunks=(nRows, nCols, 1))
                            printlog('%s. %s: Successfully deposited -> %s' %(fileindex, os.path.basename(filepath), self.dbname))
                            ginfo.attrs['is_raw'] = True;
                            ginfo.attrs['is_OK'] = True;
                            ginfo.attrs['is_processed'] = False;
                            ginfo.attrs['is_continuous'] = True;
                            ginfo.attrs['is_sample_dataset'] = True;
                            #TODO: properly treat test and training data
                            #ginfo.attrs['is_training'] = True;


                        except Exception as inst:
                            printlog('%s. %s: Failed to deposit' %(fileindex, os.path.basename(filepath)))
                            printlog(inst)
                            traceback.print_exc()

                    except Exception as inst:
                        printlog('%s. %s: All files must have unique names: Failed to create a dataset in -> %s' %(fileindex, os.path.basename(filepath), self.dbname))
                        printlog(inst)
                        traceback.print_exc()
                '''
            except Exception as inst:
                printlog('%s. %s: Failed to read in data' %
                         (fileindex, os.path.basename(filepath)))
                printlog(inst)
                traceback.print_exc()

        m5db.save_dataset(hf,
                          'raw/cmzrange',
                          data=[np.floor(minmass),
                                np.ceil(maxmass)])
        m5db.save_dataset(hf,
                          'raw/crtrange',
                          data=[
                              np.floor(minrt * self.time_mult),
                              resrt * self.time_mult,
                              np.ceil(maxrt * self.time_mult)
                          ])

        printlog('Minimal mz: %s' % np.floor(minmass))
        printlog('Maximal mz: %s' % np.ceil(maxmass))
        printlog('Minimal rt (sec): %s' % np.floor(minrt * self.time_mult))
        printlog('Maximal rt (sec): %s' % np.ceil(maxrt * self.time_mult))
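
# Illustrative sketch (standalone): the empty-scan removal used in netcdf_reader above.
# Repeated scan_index entries mark scans that contributed no points; keeping only the
# last entry of each run of equal values drops them, and scan end indices follow from
# the next scan's start. Values below are made up.
import numpy as np

scan_values = np.array([0, 5, 5, 9, 14, 14, 14, 20])   # scan_index; repeats = empty scans
time_values = np.array([1., 2., 3., 4., 5., 6., 7., 8.])
n_points = 26                                           # len(mass_values) in the reader

dd = np.diff(scan_values) != 0
dd = np.append(dd, True)                 # always keep the final scan entry
ddi = np.arange(scan_values.shape[0], dtype=np.int64)[dd]
time_values = time_values[ddi]
scan_values = scan_values[ddi]

scan_end_values = np.append(scan_values[1:] - 1, n_points - 1)
scan_indcs = np.vstack([scan_values, scan_end_values])

print(scan_indcs)
# [[ 0  5  9 14 20]
#  [ 4  8 13 19 25]]
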
Example #9
def reconstruct_datasets(dbfilename, quantity_integrals, fragments,
                         h5writepath, h5fullprofile, delta_mz, delta_rt):

    if len(fragments[0]) != len(quantity_integrals[0]):
        printlog(
            'RT lists between quantity integrals and fragmentation patterns do not match! %s vs %s'
            % (len(quantity_integrals[0]), len(fragments[0])))
        return

    #rts, dataset_names, data_values, mzs, intfrac, deconvoluted
    (rts, dataset_names_list, data_values, imzs, intfrac,
     deconvoluted) = quantity_integrals

    dd = np.abs(np.subtract(fragments[0], quantity_integrals[0]))
    mask = dd > delta_rt
    dmismatch = np.sum(mask)
    if dmismatch > 0:
        printlog(
            'RT lists between quantity integrals and fragmentation patterns do not match within %.3f tolerance!'
            % delta_rt)
        dif1 = fragments[0][mask]
        dif2 = quantity_integrals[0][mask]
        indx = np.arange(1, mask.shape[0] + 1, dtype=np.int64)[mask]
        for i in range(dif1.shape[0]):
            printlog(
                'Mismatch in\t%s:\tQI_RT:\t%.3f\tFP_RT:\t%.3f\tAllowed tolerance:\t%.3f'
                % (indx[i], dif2[i], dif1[i], delta_rt))

    with h5py.File(dbfilename, 'a') as h5file:
        if h5writepath in h5file:
            work_group = h5file[h5writepath]
        else:
            work_group = h5file.create_group(h5writepath)

        crt = fragments[0]

        ms_spectra = fragments[1]

        mzs = []

        for i in range(len(ms_spectra)):
            mzs.append(ms_spectra[i][0, :])

        cmz = np.sort(np.unique(np.hstack(mzs)))

        cmz = condense_with_tolerance(cmz, delta_mz)

        n_cmz = cmz.shape[0]
        n_crt = crt.shape[0]
        n_samples = data_values.shape[0]

        if 'quantity_integrals' in work_group:
            q_i = work_group['quantity_integrals']
            q_i.resize((n_samples, n_crt))
        else:
            q_i = work_group.create_dataset('quantity_integrals',
                                            shape=(n_samples, n_crt),
                                            maxshape=(n_samples, None),
                                            chunks=True,
                                            compression="gzip",
                                            compression_opts=5,
                                            dtype=np.float64)

        q_i[:, :] = data_values[:, :]

        if 'integral_MS_spectra' in work_group:
            frag_pattern = work_group['integral_MS_spectra']
            frag_pattern.resize((n_crt, n_cmz))
        else:
            frag_pattern = work_group.create_dataset('integral_MS_spectra',
                                                     shape=(n_crt, n_cmz),
                                                     maxshape=(None, n_cmz),
                                                     chunks=True,
                                                     compression="gzip",
                                                     compression_opts=5,
                                                     dtype=np.float64)

        for i in range(n_crt):
            spec = np.zeros(n_cmz, dtype=np.float64)
            match_ind = nn_match(cmz,
                                 fragments[1][i][0, :],
                                 tolerance=delta_mz)
            mask = match_ind >= 0
            spec[mask] = fragments[1][i][1, match_ind[mask]]

            max_spec = np.max(spec)

            if max_spec > 0.0:
                spec = spec / max_spec

            frag_pattern[i, :] = spec[:]

        mh5.save_dataset(h5file, h5writepath + '/grouped_rts', crt * 60.0)
        mh5.save_dataset(h5file, h5writepath + '/grouped_cmz', cmz)

        if 'X_3D' in work_group:
            dc_X_3D = work_group['X_3D']
            dc_X_3D.resize((n_samples, n_cmz, n_crt))
        else:
            dc_X_3D = work_group.create_dataset('X_3D',
                                                shape=(n_samples, n_cmz,
                                                       n_crt),
                                                maxshape=(None, None, None),
                                                chunks=True,
                                                compression="gzip",
                                                compression_opts=5,
                                                dtype=np.float64)

        max_frag = np.sum(frag_pattern, axis=1)

        frag = np.array(frag_pattern)

        mask = max_frag > 0.0

        # normalise each retained fragmentation pattern by its summed intensity
        frag[mask, :] = frag[mask, :] / max_frag[mask].reshape(-1, 1)

        dc_X_3D[:, :, :] = frag.transpose().reshape((1, n_cmz, n_crt))[:, :, :]

        dc_X_3D[:, :, :] = dc_X_3D[:, :, :] * np.array(q_i).reshape(
            (n_samples, 1, n_crt))

        if 'dataset_names' in work_group:
            dataset_names = work_group['dataset_names']
            dataset_names.resize((n_samples, 2))
        else:
            dataset_names = work_group.create_dataset('dataset_names',
                                                      shape=(n_samples, 2),
                                                      chunks=(10000, 2),
                                                      maxshape=(None, 2),
                                                      compression="gzip",
                                                      compression_opts=5,
                                                      dtype=np.uint64)

        #utf_8 array to hold strings
        if 'utf_8' in work_group:
            utf_8 = work_group['utf_8']
        else:
            utf_8 = work_group.create_dataset('utf_8',
                                              shape=(1, ),
                                              maxshape=(None, ),
                                              chunks=(100000, ),
                                              compression="gzip",
                                              compression_opts=5,
                                              dtype=np.uint8)

        h5write_strings(dataset_names,
                        utf_8,
                        dataset_names_list,
                        overwrite=True)

        work_group.attrs['deconvoluted'] = deconvoluted
        work_group.attrs['Imported'] = True

        mh5.save_dataset(h5file, h5writepath + '/group_variance', intfrac)

        q = np.array(q_i).flatten()

        mask = q > 0.0

        q_masked = q[mask]

        max_integral = np.max(q_masked)

        if max_integral <= 0.0:
            max_integral = 1

        order = np.argsort(-q_masked)

        orderall = np.zeros(q.shape, dtype=np.int64)

        rel_integral = np.zeros(q.shape, dtype=np.float64)

        orderall[mask] = order

        rel_integral[mask] = q_masked / max_integral * 100.0

        mh5.save_dataset(h5file, h5writepath + '/rel_integrals',\
                                                 data = rel_integral.reshape(q_i.shape), compression_opts = 5)

        mh5.save_dataset(h5file, h5writepath + '/order',\
                                                 data = orderall.reshape(q_i.shape), compression_opts = 5)

        mean_peak_width = np.median(np.diff(crt))
        print(mean_peak_width)

        #work_group.attrs['mean_peak_width'] = mean_peak_width / 4 * 60.0;

        for i in range(len(dataset_names_list)):
            dataset_name = dataset_names_list[i]
            mh5.save_dataset(h5file, h5writepath + dataset_name + '/quantity_integrals',\
                                                data = q_i[i, :], compression_opts = 5)

            sub_group = work_group[dataset_name]
            sub_group.attrs['deconvoluted'] = deconvoluted
            sub_group.attrs['has_integrals'] = True
            sub_group.attrs['re_imported'] = True
            sub_group.attrs['is_OK'] = True
            sub_group.attrs['is_processed'] = True
            sub_group.attrs['is_continuous'] = False
            sub_group.attrs['is_raw'] = False
            sub_group.attrs['is_sample_dataset'] = True
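
# Illustrative sketch (standalone, and an assumption: nn_match and condense_with_tolerance
# are project helpers not shown in this listing): one way to match a common m/z grid
# against a measured spectrum within a tolerance, returning for each grid point the index
# of the nearest measured peak or -1 when nothing lies within delta_mz. This is the shape
# of result that reconstruct_datasets uses to project fragmentation patterns onto cmz.
import numpy as np

def nn_match_sketch(grid, measured, tolerance):
    """For each grid value, index of the nearest value in measured (sorted ascending), else -1."""
    pos = np.searchsorted(measured, grid)
    left = np.clip(pos - 1, 0, measured.size - 1)
    right = np.clip(pos, 0, measured.size - 1)
    pick = np.where(np.abs(measured[left] - grid) <= np.abs(measured[right] - grid),
                    left, right)
    return np.where(np.abs(measured[pick] - grid) <= tolerance, pick, -1)

cmz = np.array([50.0, 51.0, 52.0, 53.0])
spectrum_mz = np.array([50.02, 52.9])
spectrum_int = np.array([100.0, 40.0])

idx = nn_match_sketch(cmz, spectrum_mz, tolerance=0.1)
spec = np.zeros(cmz.size)
spec[idx >= 0] = spectrum_int[idx[idx >= 0]]
print(idx)    # [ 0 -1 -1  1]
print(spec)   # [100.   0.   0.  40.]
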
Example #10
    def bin_h5(self, dbfilepath, datasets):
        """
        Performs bining of GC-MS data to generate data intensity matrix [number of mz features x number of scans]
        common across all samples
        """

        with h5py.File(dbfilepath, 'a') as h5file:

            printlog(
                "\nPreparing for intra-sample m/z correction of %s datasets from %s...\n"
                % (len(datasets), dbfilepath))
            dataindex = 0
            i = 0

            peak_width = 0.0
            dataset_count = 0

            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'cmz',
                             data=self.__binids,
                             compression_opts=5)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'crt',
                             data=self.__rtvals,
                             compression_opts=5)
            peak_finder = PeakFinder(dbfilepath, self.params['h5writepath'],
                                     '')

            for datasetid in datasets:
                dataindex = dataindex + 1
                try:
                    mzraw = mh5.load_dataset(h5file,
                                             '/raw' + datasetid + '/mz')
                    spraw = mh5.load_dataset(h5file,
                                             '/raw' + datasetid + '/sp')
                    scanidx = mh5.load_dataset(h5file,
                                               '/raw' + datasetid + '/scan')
                    rtraw = mh5.load_dataset(h5file,
                                             '/raw' + datasetid + '/time')
                    sp2D, cmz, crt = self.bin_sp(mzraw, spraw, scanidx, rtraw)
                    mh5.save_dataset(h5file,
                                     self.params['h5writepath'][:-1] +
                                     datasetid + '/sp',
                                     data=sp2D,
                                     compression_opts=5)
                    dataset_count += 1
                    peaks, npeaks = peak_finder.findpeaks_sp(np.sum(
                        sp2D, axis=1).flatten(),
                                                             gap=5)

                    if npeaks > 10:

                        threshold = median_threshold(peaks[0, :])

                        mask = peaks[0, :] >= threshold

                        ipeak_widths = peaks[10, mask]

                        if len(ipeak_widths) > 1:

                            sorted_peakwidths = ipeak_widths[np.argsort(
                                peaks[0, mask])]

                            slice_count = int(sorted_peakwidths.shape[0] / 10)
                            #print(slice_count)
                            if slice_count > 0:
                                quant = np.min(
                                    sorted_peakwidths[0:slice_count])
                            else:
                                quant = 0.0
                            #print(quant)
                            med = np.median(ipeak_widths) / 3.0
                            #print(med)

                            peak_width += max(med, quant)

                            i = i + 1
                        else:
                            printlog(
                                'No peaks passed threshold in %s. Skipping mean peak width estimation...'
                                % datasetid)

                    else:
                        printlog(
                            'Less than 10 peaks detected in %s. Skipping mean peak width estimation...'
                            % datasetid)

                    printlog(
                        '%s. %s: Successfully corrected and deposited -> %s%s'
                        % (dataindex, datasetid, os.path.basename(dbfilepath),
                           self.params['h5writepath']))

                    target_gname = self.params['h5writepath'][:-1] + datasetid
                    source_gname = '/raw' + datasetid

                    wgroup = h5file[target_gname]
                    sgroup = h5file[source_gname]

                    wgroup.attrs['is_raw'] = False
                    wgroup.attrs['is_OK'] = True
                    wgroup.attrs['is_processed'] = True
                    wgroup.attrs['is_continuous'] = True
                    wgroup.attrs['is_sample_dataset'] = True
                    wgroup.attrs['parent'] = np.string_(source_gname)
                    mh5.copy_meta_over(sgroup, wgroup)

                except Exception as inst:
                    printlog('%s. %s: Failed to be corrected' %
                             (dataindex, datasetid))
                    printlog(inst)
                    traceback.print_exc()

            # guard against division by zero when no dataset yields a peak-width estimate
            peak_width = peak_width / max(i, 1)

            sizesp = np.array([len(crt), len(cmz), dataset_count])

            printlog('Estimated min rt peak width: %s sec or %.2f min' %
                     (peak_width, peak_width / 60.0))
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'peak_width',
                             data=peak_width)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'mzrange',
                             data=self.mzrange)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'rtrange',
                             data=self.rtrange)
            mh5.save_dataset(h5file,
                             self.params['h5writepath'] + 'sizesp',
                             data=sizesp)
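
# Illustrative sketch (standalone, with made-up peak data): the per-dataset peak-width
# heuristic from bin_h5 above. For peaks passing an intensity threshold, the estimate is
# max(median width / 3, smallest width among the lowest-intensity tenth of those peaks),
# and the per-dataset estimates are then averaged. The project's median_threshold helper
# is not shown here, so a plain median stands in for it.
import numpy as np

rng = np.random.default_rng(1)
intensities = rng.lognormal(mean=3.0, sigma=1.0, size=200)   # peaks[0, :]
widths = rng.normal(loc=4.0, scale=0.8, size=200)            # peaks[10, :]

threshold = np.median(intensities)                 # placeholder for median_threshold()
mask = intensities >= threshold
ipeak_widths = widths[mask]

sorted_widths = ipeak_widths[np.argsort(intensities[mask])]  # widths ordered by peak intensity
slice_count = int(sorted_widths.shape[0] / 10)
quant = np.min(sorted_widths[:slice_count]) if slice_count > 0 else 0.0
med = np.median(ipeak_widths) / 3.0

peak_width_estimate = max(med, quant)
print(peak_width_estimate)
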