def filternoise_h5(dbfilepath, datasets, SmoothObject, BaselineObject, params):
    if SmoothObject and BaselineObject:
        printlog('\n' + "Preparing for smoothing and baseline correction " + os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully smoothed, baseline corrected and deposited ->'
    elif SmoothObject:
        printlog('\n' + "Preparing for smoothing " + os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully smoothed and deposited ->'
    elif BaselineObject:
        printlog('\n' + "Preparing for baseline correction " + os.path.basename(dbfilepath) + '\n')
        printstring = 'Successfully baseline corrected and deposited ->'
    else:
        return

    with h5py.File(dbfilepath, 'a') as h5file:
        i = 0
        dataindex = 0
        for datasetid in datasets:
            dataindex = dataindex + 1
            try:
                sp2D = mh5.load_dataset(h5file, params['h5readpath'][:-1] + datasetid + '/sp')
                # apply smoothing and/or baseline correction to the intensity matrix
                if SmoothObject:
                    sp2D = SmoothObject.fit(sp2D)
                if BaselineObject:
                    sp2D = BaselineObject.fit(sp2D)
                mh5.save_dataset(h5file, params['h5writepath'][:-1] + datasetid + '/sp',
                                 data=sp2D, compression_opts=5)
                printlog('%s. %s: %s %s%s' % (dataindex, datasetid, printstring,
                                              os.path.basename(dbfilepath), params['h5writepath']))
                i = i + 1

                # copy bookkeeping attributes and metadata from the source group to the output group
                target_gname = params['h5writepath'][:-1] + datasetid
                source_gname = params['h5readpath'][:-1] + datasetid
                wgroup = h5file[target_gname]
                sgroup = h5file[source_gname]
                wgroup.attrs['is_raw'] = False
                wgroup.attrs['is_OK'] = True
                wgroup.attrs['is_processed'] = True
                wgroup.attrs['is_continuous'] = True
                wgroup.attrs['is_sample_dataset'] = True
                wgroup.attrs['parent'] = np.string_(source_gname)
                mh5.copy_meta_over(sgroup, wgroup)
            except Exception as inst:
                printlog('%s. %s: %s' % (dataindex, datasetid, 'Failed'))
                printlog(inst)
                traceback.print_exc()
def save_proc_meta(dbfilepath, h5writepath, h5readpath):
    if h5writepath != h5readpath:
        mzrange = mh5.load_dataset(dbfilepath, h5readpath + 'mzrange')
        rtrange = mh5.load_dataset(dbfilepath, h5readpath + 'rtrange')
        cmz = mh5.load_dataset(dbfilepath, h5readpath + 'cmz')
        crt = mh5.load_dataset(dbfilepath, h5readpath + 'crt')
        sizesp = mh5.load_dataset(dbfilepath, h5readpath + 'sizesp')
        mh5.save_dataset(dbfilepath, h5writepath + 'mzrange', data=mzrange)
        mh5.save_dataset(dbfilepath, h5writepath + 'rtrange', data=rtrange)
        mh5.save_dataset(dbfilepath, h5writepath + 'cmz', data=cmz, compression_opts=5)
        mh5.save_dataset(dbfilepath, h5writepath + 'crt', data=crt, compression_opts=5)
        mh5.save_dataset(dbfilepath, h5writepath + 'sizesp', data=sizesp)
def aling_h5(self, dbfilepath, datasets, h5readpath, h5writepath):
    #if not self.ref2D:
    printlog("\nPerforming internal sample retention time profile alignment across %s datasets from \n%s...\n"
             % (len(datasets), dbfilepath))
    dataindex = 0
    with h5py.File(dbfilepath, 'a') as h5file:
        #mh5.save_dataset(h5file, h5writepath + '/ref2D', data = self.ref2D, compression_opts = 5)
        for datasetid in datasets:
            dataindex = dataindex + 1
            try:
                sp2D = mh5.load_dataset(h5file, h5readpath[:-1] + datasetid + '/sp')
                nrt, nmz = sp2D.shape
                # use the sample's mean chromatographic profile as the alignment reference
                ref2D = np.mean(sp2D, axis=1)
                mh5.save_dataset(h5file, h5writepath + datasetid.lstrip('/') + '/ref2D',
                                 data=ref2D, compression_opts=5)
                # align each m/z channel to the reference profile
                for i in range(nmz):
                    alprof = self.align(sp2D[:, i], ref2D[:])
                    sp2D[:, i] = alprof
                mh5.save_dataset(h5file, h5writepath[:-1] + datasetid + '/sp',
                                 data=sp2D, compression_opts=5)
                printlog('%s. %s: Successfully aligned and deposited -> %s%s'
                         % (dataindex, datasetid, os.path.basename(dbfilepath), h5writepath))

                target_gname = h5writepath[:-1] + datasetid
                source_gname = h5readpath[:-1] + datasetid
                wgroup = h5file[target_gname]
                sgroup = h5file[source_gname]
                wgroup.attrs['is_raw'] = False
                wgroup.attrs['is_OK'] = True
                wgroup.attrs['is_processed'] = True
                wgroup.attrs['is_continuous'] = True
                wgroup.attrs['is_sample_dataset'] = True
                wgroup.attrs['parent'] = np.string_(source_gname)
                mh5.copy_meta_over(sgroup, wgroup)
            except Exception as inst:
                printlog('%s. %s: Failed to be deposited' % (dataindex, datasetid))
                printlog(inst)
                traceback.print_exc()
def do_profile_alignment(dbfilepath, method='rspa',
                         params={'recursion': 1,
                                 'minsegwidth': 100,
                                 'maxpeakshift': 10,
                                 'reference': 'mean',
                                 'h5readpath': '/proc',
                                 'h5writepath': '/proc'},
                         istrain=1):
    """
    Performs advanced adjustment for chromatographic peak position variations at full profile
    resolution using a recursive segment-wise peak alignment strategy.

    Args:
        dbfilepath: The database file path.
        method: The choice of peak alignment method. Default value: 'rspa',
                i.e. recursive segment-wise peak alignment.
        params: The dictionary of peak alignment parameters.
    """

    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)

    # resolve 'auto' parameters from the stored peak width estimate
    if str(params['minsegwidth']).lower() == 'auto':
        params['minsegwidth'] = peak_width * 10.0
        printlog('Parameter "minsegwidth" is set to %s' % params['minsegwidth'])
    else:
        try:
            params['minsegwidth'] = float(params['minsegwidth'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "minsegwidth" cannot be converted to float!'
                                    % params['minsegwidth'])

    if str(params['maxpeakshift']).lower() == 'auto':
        params['maxpeakshift'] = peak_width * 5
        printlog('Parameter "maxpeakshift" is set to %s' % params['maxpeakshift'])
    else:
        try:
            params['maxpeakshift'] = float(params['maxpeakshift'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "maxpeakshift" cannot be converted to float!'
                                    % params['maxpeakshift'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if method == 'rspa':
            rtAlObj = RSPA(method, params, rtrange)
    elif istrain == 0:
        rtAlObj = RSPA()
        rtAlObj.load_procobj(dbfilepath, params['h5readpath'])

    rtAlObj.aling_h5(dbfilepath, dataset_names, params['h5readpath'], params['h5writepath'])

    if istrain == 1:
        # save into hdf5 database file
        rtAlObj.export()
        rtAlObj.save_procobj(dbfilepath, params['h5writepath'])

    rtAlObj.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])

    return
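# A minimal usage sketch for do_profile_alignment. The database file name and the
# h5readpath/h5writepath group names below are illustrative assumptions, not values taken
# from this module; 'minsegwidth' and 'maxpeakshift' may be passed as 'auto' to derive them
# from the stored 'peak_width' estimate, as handled above.
def _example_do_profile_alignment():
    example_params = {'recursion': 1,
                      'minsegwidth': 'auto',
                      'maxpeakshift': 'auto',
                      'reference': 'mean',
                      'h5readpath': '/spproc2D',   # hypothetical group holding noise-filtered profiles
                      'h5writepath': '/spal2D'}    # hypothetical group for the aligned output
    do_profile_alignment('/path/to/msdata.h5',     # hypothetical database file
                         method='rspa',            # recursive segment-wise peak alignment
                         params=example_params,
                         istrain=1)                # istrain=1 trains and stores the alignment object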
def do_noisefilter(dbfilepath,
                   smoothmethod='sqfilter', smoothparams={'window': 5, 'degree': 3},
                   baselinemethod='tophat', baselineparams={'frame': 90},
                   params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'},
                   istrain=1):
    """
    Performs adjustment for high frequency noise and lower frequency baseline distortions
    due to a variety of instrumental and experimental reasons.

    Args:
        dbfilepath: a user-specified path to the h5 database file
        smoothmethod: The type of noise filtering method. Default value: 'sqfilter',
                      i.e. the Savitzky-Golay filter.
        smoothparams: The dictionary of parameter arguments for the noise filtering method
        baselinemethod: The type of baseline correction method. Default value: 'tophat',
                        i.e. the top-hat morphological filter.
        baselineparams: The dictionary of parameter arguments for the baseline correction method
    """

    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)

    # resolve 'auto' parameters from the stored peak width estimate
    if str(smoothparams['window']).lower() == 'auto':
        smoothparams['window'] = peak_width * 0.5
        printlog('Parameter "window" is set to %s' % smoothparams['window'])
    else:
        try:
            smoothparams['window'] = float(smoothparams['window'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "window" cannot be converted to float!'
                                    % smoothparams['window'])

    if str(baselineparams['frame']).lower() == 'auto':
        baselineparams['frame'] = peak_width * 15
        printlog('Parameter "frame" is set to %s' % baselineparams['frame'])
    else:
        try:
            baselineparams['frame'] = float(baselineparams['frame'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "frame" cannot be converted to float!'
                                    % baselineparams['frame'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if smoothmethod != 'none':
            SmoothObject = SmoothFilter(smoothmethod, smoothparams, rtrange)
        else:
            SmoothObject = []
        if baselinemethod != 'none':
            BaselineObject = BaselineFilter(baselinemethod, baselineparams, rtrange)
        else:
            BaselineObject = []
    elif istrain == 0:
        SmoothObject = SmoothFilter()
        SmoothObject.load_procobj(dbfilepath, params['h5readpath'])
        if SmoothObject.method == '':
            SmoothObject = []
        # load the stored baseline filter before checking its method
        BaselineObject = BaselineFilter()
        BaselineObject.load_procobj(dbfilepath, params['h5readpath'])
        if BaselineObject.method == '':
            BaselineObject = []

    filternoise_h5(dbfilepath, dataset_names, SmoothObject, BaselineObject, params)

    if istrain == 1:
        # save into hdf5 database file
        if SmoothObject:
            SmoothObject.export(rtrange)
            SmoothObject.save_procobj(dbfilepath, params['h5writepath'])
        if BaselineObject:
            BaselineObject.export(rtrange)
            BaselineObject.save_procobj(dbfilepath, params['h5writepath'])
        SmoothObject.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])

    return
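# A minimal usage sketch for do_noisefilter, assuming raw profiles have already been binned
# into the '/sp2D' group (the function's default read path). The database file name is
# hypothetical; 'window' and 'frame' are passed as 'auto' so they scale with the stored
# peak width, as handled above.
def _example_do_noisefilter():
    do_noisefilter('/path/to/msdata.h5',                        # hypothetical database file
                   smoothmethod='sqfilter',                     # Savitzky-Golay smoothing
                   smoothparams={'window': 'auto', 'degree': 3},
                   baselinemethod='tophat',                     # top-hat morphological baseline
                   baselineparams={'frame': 'auto'},
                   params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'},
                   istrain=1)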
def mzxml_reader(self, filelist):
    """
    Performs reading chromatography-mass spectrometry data from mzXML files

    Args:
        filelist: the list of files
    """
    import mzxml

    hf = h5py.File(os.path.join(self.dbpath, self.dbname), 'w')

    minmass = np.inf
    maxmass = 0.
    minrt = np.inf
    maxrt = 0.
    resrt = -1.

    print(" \nReading mzXML files from '%s'...\n" % (self.dbpath))
    #print(filelist)

    # loop through the files
    fileindex = 0
    for filepath in filelist:
        fileindex = fileindex + 1
        try:
            mzXML_scans = mzxml.load_mzxml_file(filepath)
            #raise Exception('Not available yet due to issues.')
        except Exception as inst:
            printlog(inst)
            traceback.print_exc()
            print("%s: Cannot open file" % filepath)
            continue
        try:
            # collect m/z, intensity and retention time values scan by scan
            mass_values = np.array([])
            intensity_values = np.array([])
            time_list_list = []
            for scan in mzXML_scans:
                time_list_list.append(scan.retention_time)
                scan_masses_as_list, scan_intensities_as_list = zip(*scan.peaks)
                mass_values = np.concatenate((mass_values, np.asarray(scan_masses_as_list)))
                intensity_values = np.concatenate((intensity_values, np.asarray(scan_intensities_as_list)))
            time_list = np.asarray(time_list_list)

            minmass = np.min([minmass, np.min(mass_values)])
            maxmass = np.max([maxmass, np.max(mass_values)])

            if not len(mass_values) == len(intensity_values):
                print("The length of mass_list is not equal to the length of intensity_list. Data not deposited!")
                continue

            if np.median(np.diff(mass_values)) < 0:
                # descending order arrangement of m/z values
                scanidx = np.diff(mass_values) >= 0
                descend = 1
            else:
                # ascending order arrangement of m/z values
                scanidx = np.diff(mass_values) <= 0
                descend = 0

            # calculate the number of scans
            scanidx = np.append(scanidx, True)
            nscans = np.sum(scanidx)
            scanidx = np.where(scanidx == True)
            scanidx = scanidx[0][np.arange(0, nscans)]

            # pre-allocate arrays
            scan_start = np.array([]).astype(int)
            scan_end = np.array([]).astype(int)

            # arrange data for hdf5 database file storage
            istart = 0
            for iend in scanidx:
                imass_values = mass_values[np.arange(istart, iend + 1)]
                iintensity_values = intensity_values[np.arange(istart, iend + 1)]
                if len(imass_values) > 0:
                    if descend == 1:
                        mass_values[np.arange(istart, iend + 1)] = imass_values[::-1]
                        intensity_values[np.arange(istart, iend + 1)] = iintensity_values[::-1]
                    scan_start = np.append(scan_start, istart)
                    scan_end = np.append(scan_end, iend)
                istart = iend + 1

            # arrange data for hdf5 database file storage
            scan_indlist = np.vstack((scan_start, scan_end))

            if np.median(np.diff(time_list)) < 0:
                time_list = time_list[::-1]

            # update retention time range and resolution
            minrt = np.min([minrt, np.min(time_list)])
            maxrt = np.max([maxrt, np.max(time_list)])
            if resrt == -1:
                resrt = np.median(np.diff(time_list))
            else:
                resrt = 0.5 * (np.median(np.diff(time_list)) + resrt)

            # sanity check
            if not len(time_list) == len(scan_start):
                print("The number of time points (%d) does not equal the number of scans (%d). Data not deposited!"
                      % (len(time_list), len(scan_start)))
                continue
            else:
                # deposit data into hdf5 database file
                dpath = 'raw/' + os.path.splitext(os.path.basename(filepath))[0]
                try:
                    ginfo = hf.create_group(dpath)
                    try:
                        m5db.save_dataset(ginfo, 'mz', data=mass_values, compression_opts=5)
                        m5db.save_dataset(ginfo, 'sp', data=intensity_values, compression_opts=5)
                        m5db.save_dataset(ginfo, 'time', data=time_list * self.time_mult)
                        m5db.save_dataset(ginfo, 'scan', data=scan_indlist)
                        # chunks=(nRows, nCols, 1))
                        print('%s. %s: Successfully deposited -> %s'
                              % (fileindex, os.path.basename(filepath), self.dbname))
                    except Exception:
                        print('%s. %s: Failed to deposit' % (fileindex, os.path.basename(filepath)))
                except Exception:
                    print('%s. %s: All files must have unique names: Failed to create a dataset in -> %s'
                          % (fileindex, os.path.basename(filepath), self.dbname))
        except Exception:
            print('%s. %s: Failed to read in data' % (fileindex, os.path.basename(filepath)))
            raise

    m5db.save_dataset(hf, 'raw/cmzrange', data=[np.floor(minmass), np.ceil(maxmass)])
    m5db.save_dataset(hf, 'raw/crtrange',
                      data=[np.floor(minrt * self.time_mult), resrt * self.time_mult,
                            np.ceil(maxrt * self.time_mult)])
    printlog('Minimal mz: %s' % np.floor(minmass))
    printlog('Maximal mz: %s' % np.ceil(maxmass))
    printlog('Minimal rt (sec): %s' % np.floor(minrt * self.time_mult))
    printlog('Maximal rt (sec): %s' % np.ceil(maxrt * self.time_mult))
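# Standalone sketch of the scan-boundary detection used in mzxml_reader above for the
# ascending-order case: with all scans concatenated into one m/z vector, a new scan starts
# wherever the ascending m/z ordering resets (difference to the next value is non-positive).
# The helper name is illustrative; it is not part of the pipeline.
def _example_scan_boundaries(mass_values):
    import numpy as np
    mass_values = np.asarray(mass_values, dtype=float)
    # last index of each scan: positions where the ascending order breaks, plus the final index
    scanidx = np.append(np.diff(mass_values) <= 0, True)
    scan_end = np.where(scanidx)[0]
    scan_start = np.append(0, scan_end[:-1] + 1)
    # same [2 x nscans] start/end layout as the 'scan' dataset deposited by the readers
    return np.vstack((scan_start, scan_end))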
def mzml_reader(self, filelist):
    """
    Performs reading chromatography-mass spectrometry data from mzML files

    Args:
        filelist: the list of files
    """
    import pymzml as pyml

    hf = h5py.File(os.path.join(self.dbpath, self.dbname), 'w')

    minmass = np.inf
    maxmass = 0.
    minrt = np.inf
    maxrt = 0.
    resrt = -1.

    printlog(" \nReading mzML files from '%s'...\n" % (self.dbpath))

    # loop through the files
    fileindex = 0
    for filepath in filelist:
        fileindex = fileindex + 1
        try:
            dataset = pyml.run.Reader(filepath)
        except Exception as inst:
            printlog("%s: Cannot open file" % filepath)
            printlog(inst)
            traceback.print_exc()
            continue
        try:
            # first pass: count MS level 1 scans and peak values to pre-allocate arrays
            # (faster than appending)
            n_scans = 0
            n_values = 0
            for sp in dataset:
                if 'MS:1000511' in sp:  # (code 'MS:1000511' for ms level)
                    if sp['MS:1000511'] == 1:  # ms level 1 only
                        if 'MS:1000016' in sp:  # (code 'MS:1000016' for retention time value)
                            X = np.array(sp.centroidedPeaks).astype(float)
                            if len(X) > 0 and len(X.shape) == 2:
                                healthcheck = np.logical_and(np.all(np.isfinite(X)), np.all(np.isreal(X)))
                                if healthcheck:
                                    n_values = n_values + len(X)
                                    n_scans = n_scans + 1
                                else:
                                    printlog('Slice with numerical errors. Skipping...')
                            else:
                                printlog('Empty slice. Skipping...')

            scan_start = np.zeros(n_scans).astype(int)
            scan_end = np.zeros(n_scans).astype(int)
            mass_values = np.zeros(n_values).astype(float)
            intensity_values = np.zeros(n_values).astype(float)
            time_list = np.zeros(n_scans).astype(float)

            # second pass: fill the pre-allocated arrays
            istart = 0
            iscan = -1
            dataset = pyml.run.Reader(filepath)
            for sp in dataset:
                if 'MS:1000511' in sp:  # (code 'MS:1000511' for ms level)
                    if sp['MS:1000511'] == 1:  # ms level 1 only
                        if 'MS:1000016' in sp:  # (code 'MS:1000016' for retention time value)
                            X = np.array(sp.centroidedPeaks).astype(float)
                            if len(X) > 0 and len(X.shape) == 2:
                                healthcheck = np.logical_and(np.all(np.isfinite(X)), np.all(np.isreal(X)))
                                if healthcheck:
                                    imass_values = X[:, 0]
                                    iintensity_values = X[:, 1]
                                    iscan = iscan + 1
                                    # store each scan with m/z values in ascending order
                                    if np.median(np.diff(imass_values)) < 0:
                                        imass_values = imass_values[::-1]
                                        iintensity_values = iintensity_values[::-1]
                                    scan_start[iscan] = istart
                                    iend = istart + len(imass_values) - 1
                                    scan_end[iscan] = iend
                                    mass_values[istart:iend + 1] = imass_values
                                    intensity_values[istart:iend + 1] = iintensity_values
                                    istart = iend + 1
                                    time_list[iscan] = sp['MS:1000016']
                                else:
                                    printlog('Slice with numerical errors. Skipping...')
                            else:
                                printlog('Empty slice. Skipping...')

            # arrange data for hdf5 database file storage
            scan_indlist = np.vstack((scan_start, scan_end))

            # in case data are in descending order
            if np.median(np.diff(time_list)) < 0:
                time_list = time_list[::-1]

            # update retention time range and resolution
            minrt = np.min([minrt, np.min(time_list)])
            maxrt = np.max([maxrt, np.max(time_list)])
            if resrt == -1:
                resrt = np.median(np.diff(time_list))
            else:
                resrt = 0.5 * (np.median(np.diff(time_list)) + resrt)

            # update mass range
            minmass = np.min([minmass, np.min(mass_values)])
            maxmass = np.max([maxmass, np.max(mass_values)])

            # sanity check
            if not len(time_list) == len(scan_start):
                printlog("The number of time points (%d) does not equal the number of scans (%d). Data not deposited!"
                         % (len(time_list), len(scan_start)))
                continue
            else:
                # deposit data into hdf5 database file
                dpath = 'raw/' + os.path.splitext(os.path.basename(filepath))[0]
                try:
                    ginfo = hf.create_group(dpath)
                    try:
                        m5db.save_dataset(ginfo, 'mz', data=mass_values, compression_opts=5)
                        m5db.save_dataset(ginfo, 'sp', data=intensity_values, compression_opts=5)
                        m5db.save_dataset(ginfo, 'time', data=(time_list * self.time_mult))
                        m5db.save_dataset(ginfo, 'scan', data=scan_indlist)
                        # chunks=(nRows, nCols, 1))
                        printlog('%s. %s: Successfully deposited -> %s'
                                 % (fileindex, os.path.basename(filepath), self.dbname))
                        ginfo.attrs['is_raw'] = True
                        ginfo.attrs['is_OK'] = True
                        ginfo.attrs['is_processed'] = False
                        ginfo.attrs['is_continuous'] = True
                        ginfo.attrs['is_sample_dataset'] = True
                        #TODO: properly treat test and training data
                        #ginfo.attrs['is_training'] = True;
                    except Exception as inst:
                        printlog('%s. %s: Failed to deposit' % (fileindex, os.path.basename(filepath)))
                        printlog(inst)
                        traceback.print_exc()
                except Exception as inst:
                    printlog('%s. %s: All files must have unique names: Failed to create a dataset in -> %s'
                             % (fileindex, os.path.basename(filepath), self.dbname))
                    printlog(inst)
                    traceback.print_exc()
        except Exception as inst:
            printlog('%s. %s: Failed to read in data' % (fileindex, os.path.basename(filepath)))
            printlog(inst)
            traceback.print_exc()

    m5db.save_dataset(hf, 'raw/cmzrange', data=[np.floor(minmass), np.ceil(maxmass)])
    m5db.save_dataset(hf, 'raw/crtrange',
                      data=[np.floor(minrt * self.time_mult), resrt * self.time_mult,
                            np.ceil(maxrt * self.time_mult)])
    printlog('Minimal mz: %s' % np.floor(minmass))
    printlog('Maximal mz: %s' % np.ceil(maxmass))
    printlog('Minimal rt (sec): %s' % np.floor(minrt * self.time_mult))
    printlog('Maximal rt (sec): %s' % np.ceil(maxrt * self.time_mult))
def netcdf_reader(self, filelist):
    """
    Performs reading chromatography-mass spectrometry data from netCDF files

    Args:
        filelist: the list of files
    """
    from netCDF4 import Dataset

    hf = h5py.File(os.path.join(self.dbpath, self.dbname), 'w')

    minmass = np.inf
    maxmass = 0.
    minrt = np.inf
    maxrt = 0.
    resrt = -1.

    printlog(" \nReading netCDF files from '%s'...\n" % (self.dbpath))

    # loop through the files
    fileindex = 0
    for filepath in filelist:
        fileindex = fileindex + 1
        try:
            file = Dataset(filepath)
        except Exception as inst:
            printlog("%s: Cannot open file" % filepath)
            printlog(inst)
            traceback.print_exc()
            continue
        try:
            mass_values = np.array(file.variables[self.__mass_string][:])
            intensity_values = np.array(file.variables[self.__intensity_string][:])
            time_values = np.array(file.variables[self.__time_string][:])
            scan_values = np.array(file.variables[self.__scan_string][:])

            minmass = np.min([minmass, np.min(mass_values)])
            maxmass = np.max([maxmass, np.max(mass_values)])

            if not len(mass_values) == len(intensity_values):
                printlog("The length of mass_list is not equal to the length of intensity_list. Data not deposited!")
                continue

            if not len(time_values) == len(scan_values):
                printlog("The length of scan_aquisition_time is not equal to the length of scan_index. Data not deposited!")
                continue

            # removing empty scans
            dd = np.diff(scan_values) != 0
            dd = np.append(dd, True)
            ddi = np.arange(scan_values.shape[0], dtype=np.int64)[dd]
            time_values = time_values[ddi]
            scan_values = scan_values[ddi]

            #if np.median(np.diff(mass_values))<0:
            if np.median(np.diff(time_values)) < 0:
                printlog('Decreasing scan time values detected! Data not deposited!')
                continue

            # update retention time range and resolution
            minrt = np.min([minrt, np.min(time_values)])
            maxrt = np.max([maxrt, np.max(time_values)])
            if resrt == -1:
                resrt = np.median(np.diff(time_values))
            else:
                resrt = 0.5 * (np.median(np.diff(time_values)) + resrt)

            # derive per-scan start/end indices into the flat mass/intensity arrays
            scan_end_values = np.append(scan_values[1:] - 1, mass_values.shape[0] - 1)
            scan_indcs = np.vstack([scan_values, scan_end_values])

            # deposit data into hdf5 database file
            dpath = 'raw/' + os.path.splitext(os.path.basename(filepath))[0]
            try:
                ginfo = hf.create_group(dpath)
                try:
                    m5db.save_dataset(ginfo, 'mz', data=mass_values, compression_opts=5)
                    m5db.save_dataset(ginfo, 'sp', data=intensity_values, compression_opts=5)
                    m5db.save_dataset(ginfo, 'time', data=(time_values * self.time_mult))
                    m5db.save_dataset(ginfo, 'scan', data=scan_indcs)
                    # chunks=(nRows, nCols, 1))
                    printlog('%s. %s: Successfully deposited -> %s'
                             % (fileindex, os.path.basename(filepath), self.dbname))
                    ginfo.attrs['is_raw'] = True
                    ginfo.attrs['is_OK'] = True
                    ginfo.attrs['is_processed'] = False
                    ginfo.attrs['is_continuous'] = True
                    ginfo.attrs['is_sample_dataset'] = True
                    #TODO: properly treat test and training data
                    #ginfo.attrs['is_training'] = True;
                except Exception as inst:
                    printlog('%s. %s: Failed to deposit' % (fileindex, os.path.basename(filepath)))
                    printlog(inst)
                    traceback.print_exc()
            except Exception as inst:
                printlog('%s. %s: All files must have unique names: Failed to create a dataset in -> %s'
                         % (fileindex, os.path.basename(filepath), self.dbname))
                printlog(inst)
                traceback.print_exc()
        except Exception as inst:
            printlog('%s. %s: Failed to read in data' % (fileindex, os.path.basename(filepath)))
            printlog(inst)
            traceback.print_exc()

    m5db.save_dataset(hf, 'raw/cmzrange', data=[np.floor(minmass), np.ceil(maxmass)])
    m5db.save_dataset(hf, 'raw/crtrange',
                      data=[np.floor(minrt * self.time_mult), resrt * self.time_mult,
                            np.ceil(maxrt * self.time_mult)])
    printlog('Minimal mz: %s' % np.floor(minmass))
    printlog('Maximal mz: %s' % np.ceil(maxmass))
    printlog('Minimal rt (sec): %s' % np.floor(minrt * self.time_mult))
    printlog('Maximal rt (sec): %s' % np.ceil(maxrt * self.time_mult))
def reconstruct_datasets(dbfilename, quantity_integrals, fragments, h5writepath, h5fullprofile, delta_mz, delta_rt):
    if len(fragments[0]) != len(quantity_integrals[0]):
        printlog('RT lists between quantity integrals and fragmentation patterns do not match! %s vs %s'
                 % (len(quantity_integrals[0]), len(fragments[0])))
        return

    #rts, dataset_names, data_values, mzs, intfrac, deconvoluted
    (rts, dataset_names_list, data_values, imzs, intfrac, deconvoluted) = quantity_integrals

    # report any retention time mismatches beyond the allowed tolerance
    dd = np.abs(np.subtract(fragments[0], quantity_integrals[0]))
    mask = dd > delta_rt
    dmismatch = np.sum(mask)
    if dmismatch > 0:
        printlog('RT lists between quantity integrals and fragmentation patterns do not match within %.3f tolerance!'
                 % delta_rt)
        dif1 = fragments[0][mask]
        dif2 = quantity_integrals[0][mask]
        indx = np.arange(1, mask.shape[0] + 1, dtype=np.int64)[mask]
        for i in range(dif1.shape[0]):
            printlog('Mismatch in\t%s:\tQI_RT:\t%.3f\tFP_RT:\t%.3f\tAllowed tolerance:\t%.3f'
                     % (indx[i], dif2[i], dif1[i], delta_rt))

    with h5py.File(dbfilename, 'a') as h5file:
        if h5writepath in h5file:
            work_group = h5file[h5writepath]
        else:
            work_group = h5file.create_group(h5writepath)

        crt = fragments[0]
        ms_spectra = fragments[1]
        mzs = []
        for i in range(len(ms_spectra)):
            mzs.append(ms_spectra[i][0, :])

        # build a common m/z vector from all fragmentation spectra, merged within delta_mz tolerance
        cmz = np.sort(np.unique(np.hstack(mzs)))
        cmz = condense_with_tolerance(cmz, delta_mz)

        n_cmz = cmz.shape[0]
        n_crt = crt.shape[0]
        n_samples = data_values.shape[0]

        if 'quantity_integrals' in work_group:
            q_i = work_group['quantity_integrals']
            q_i.resize((n_samples, n_crt))
        else:
            q_i = work_group.create_dataset('quantity_integrals',
                                            shape=(n_samples, n_crt),
                                            maxshape=(n_samples, None),
                                            chunks=True,
                                            compression="gzip",
                                            compression_opts=5,
                                            dtype=np.float64)
        q_i[:, :] = data_values[:, :]

        if 'integral_MS_spectra' in work_group:
            frag_pattern = work_group['integral_MS_spectra']
            frag_pattern.resize((n_crt, n_cmz))
        else:
            frag_pattern = work_group.create_dataset('integral_MS_spectra',
                                                     shape=(n_crt, n_cmz),
                                                     maxshape=(None, n_cmz),
                                                     chunks=True,
                                                     compression="gzip",
                                                     compression_opts=5,
                                                     dtype=np.float64)

        # map each fragmentation spectrum onto the common m/z vector and normalise to its maximum
        for i in range(n_crt):
            spec = np.zeros(n_cmz, dtype=np.float64)
            match_ind = nn_match(cmz, fragments[1][i][0, :], tolerance=delta_mz)
            mask = match_ind >= 0
            spec[mask] = fragments[1][i][1, match_ind[mask]]
            max_spec = np.max(spec)
            if max_spec > 0.0:
                spec = spec / max_spec
            frag_pattern[i, :] = spec[:]

        mh5.save_dataset(h5file, h5writepath + '/grouped_rts', crt * 60.0)
        mh5.save_dataset(h5file, h5writepath + '/grouped_cmz', cmz)

        if 'X_3D' in work_group:
            dc_X_3D = work_group['X_3D']
            dc_X_3D.resize((n_samples, n_cmz, n_crt))
        else:
            dc_X_3D = work_group.create_dataset('X_3D',
                                                shape=(n_samples, n_cmz, n_crt),
                                                maxshape=(None, None, None),
                                                chunks=True,
                                                compression="gzip",
                                                compression_opts=5,
                                                dtype=np.float64)

        # reconstruct the 3D data cube: fragment patterns scaled by the per-sample quantity integrals
        max_frag = np.sum(frag_pattern, axis=1)
        frag = np.array(frag_pattern)
        mask = max_frag > 0.0
        frag[mask, :] = frag[mask, :] / (max_frag[mask].reshape(mask.shape[0], 1))

        dc_X_3D[:, :, :] = frag.transpose().reshape((1, n_cmz, n_crt))[:, :, :]
        dc_X_3D[:, :, :] = dc_X_3D[:, :, :] * np.array(q_i).reshape((n_samples, 1, n_crt))

        if 'dataset_names' in work_group:
            dataset_names = work_group['dataset_names']
            dataset_names.resize((n_samples, 2))
        else:
            dataset_names = work_group.create_dataset('dataset_names',
                                                      shape=(n_samples, 2),
                                                      chunks=(10000, 2),
                                                      maxshape=(None, 2),
                                                      compression="gzip",
                                                      compression_opts=5,
                                                      dtype=np.uint64)

        #utf_8 array to hold strings
        if 'utf_8' in work_group:
            utf_8 = work_group['utf_8']
        else:
            utf_8 = work_group.create_dataset('utf_8',
                                              shape=(1, ),
                                              maxshape=(None, ),
                                              chunks=(100000, ),
                                              compression="gzip",
                                              compression_opts=5,
                                              dtype=np.uint8)

        h5write_strings(dataset_names, utf_8, dataset_names_list, overwrite=True)

        work_group.attrs['deconvoluted'] = deconvoluted
        work_group.attrs['Imported'] = True

        mh5.save_dataset(h5file, h5writepath + '/group_variance', intfrac)

        # rank the quantity integrals and express them relative to the largest one
        q = np.array(q_i).flatten()
        mask = q > 0.0
        q_masked = q[mask]
        max_integral = np.max(q_masked)
        if max_integral <= 0.0:
            max_integral = 1
        order = np.argsort(-q_masked)
        orderall = np.zeros(q.shape, dtype=np.int64)
        rel_integral = np.zeros(q.shape, dtype=np.float64)
        orderall[mask] = order
        rel_integral[mask] = q_masked / max_integral * 100.0

        mh5.save_dataset(h5file, h5writepath + '/rel_integrals',
                         data=rel_integral.reshape(q_i.shape), compression_opts=5)
        mh5.save_dataset(h5file, h5writepath + '/order',
                         data=orderall.reshape(q_i.shape), compression_opts=5)

        mean_peak_width = np.median(np.diff(crt))
        print(mean_peak_width)
        #work_group.attrs['mean_peak_width'] = mean_peak_width / 4 * 60.0;

        # store per-sample quantity integrals and flag the sub-groups accordingly
        for i in range(len(dataset_names_list)):
            dataset_name = dataset_names_list[i]
            mh5.save_dataset(h5file, h5writepath + dataset_name + '/quantity_integrals',
                             data=q_i[i, :], compression_opts=5)
            sub_group = work_group[dataset_name]
            sub_group.attrs['deconvoluted'] = deconvoluted
            sub_group.attrs['has_integrals'] = True
            sub_group.attrs['re_imported'] = True
            sub_group.attrs['is_OK'] = True
            sub_group.attrs['is_processed'] = True
            sub_group.attrs['is_continuous'] = False
            sub_group.attrs['is_raw'] = False
            sub_group.attrs['is_sample_dataset'] = True
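# reconstruct_datasets relies on the helpers condense_with_tolerance and nn_match, which are
# defined elsewhere in the project. The sketch below only illustrates the kind of
# tolerance-based m/z merging assumed above; the name is prefixed with _example_ to avoid
# shadowing the real helper, and it is not the project's implementation.
def _example_condense_with_tolerance(mz, delta_mz):
    """Collapse a sorted m/z vector so that runs of values closer than delta_mz
    are represented by a single (mean) value."""
    import numpy as np
    mz = np.asarray(mz, dtype=np.float64)
    if mz.size == 0:
        return mz
    condensed = []
    group = [mz[0]]
    for value in mz[1:]:
        if value - group[-1] <= delta_mz:
            group.append(value)          # still within tolerance of the current run
        else:
            condensed.append(np.mean(group))
            group = [value]
    condensed.append(np.mean(group))
    return np.array(condensed)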
def bin_h5(self, dbfilepath, datasets):
    """
    Performs binning of GC-MS data to generate a data intensity matrix
    [number of mz features x number of scans] common across all samples
    """

    with h5py.File(dbfilepath, 'a') as h5file:
        printlog("\nPreparing for intra-sample m/z correction %s datasets from %s...\n"
                 % (len(datasets), dbfilepath))
        dataindex = 0
        i = 0
        peak_width = 0.0
        dataset_count = 0

        mh5.save_dataset(h5file, self.params['h5writepath'] + 'cmz',
                         data=self.__binids, compression_opts=5)
        mh5.save_dataset(h5file, self.params['h5writepath'] + 'crt',
                         data=self.__rtvals, compression_opts=5)

        peak_finder = PeakFinder(dbfilepath, self.params['h5writepath'], '')

        for datasetid in datasets:
            dataindex = dataindex + 1
            try:
                mzraw = mh5.load_dataset(h5file, '/raw' + datasetid + '/mz')
                spraw = mh5.load_dataset(h5file, '/raw' + datasetid + '/sp')
                scanidx = mh5.load_dataset(h5file, '/raw' + datasetid + '/scan')
                rtraw = mh5.load_dataset(h5file, '/raw' + datasetid + '/time')

                # bin the raw scans onto the common m/z and retention time grid
                sp2D, cmz, crt = self.bin_sp(mzraw, spraw, scanidx, rtraw)

                mh5.save_dataset(h5file, self.params['h5writepath'][:-1] + datasetid + '/sp',
                                 data=sp2D, compression_opts=5)
                dataset_count += 1

                # estimate the minimum chromatographic peak width from the total ion profile
                peaks, npeaks = peak_finder.findpeaks_sp(np.sum(sp2D, axis=1).flatten(), gap=5)
                if npeaks > 10:
                    threshold = median_threshold(peaks[0, :])
                    mask = peaks[0, :] >= threshold
                    ipeak_widths = peaks[10, mask]
                    if len(ipeak_widths) > 1:
                        sorted_peakwidths = ipeak_widths[np.argsort(peaks[0, mask])]
                        slice_count = int(sorted_peakwidths.shape[0] / 10)
                        #print(slice_count)
                        if slice_count > 0:
                            quant = np.min(sorted_peakwidths[0:slice_count])
                        else:
                            quant = 0.0
                        #print(quant)
                        med = np.median(ipeak_widths) / 3.0
                        #print(med)
                        peak_width += max(med, quant)
                        i = i + 1
                    else:
                        printlog('No peaks passed threshold in %s. Skipping mean peak width estimation...'
                                 % datasetid)
                else:
                    printlog('Less than 10 peaks detected in %s. Skipping mean peak width estimation...'
                             % datasetid)

                printlog('%s. %s: Successfully corrected and deposited -> %s%s'
                         % (dataindex, datasetid, os.path.basename(dbfilepath), self.params['h5writepath']))

                target_gname = self.params['h5writepath'][:-1] + datasetid
                source_gname = '/raw' + datasetid
                wgroup = h5file[target_gname]
                sgroup = h5file[source_gname]
                wgroup.attrs['is_raw'] = False
                wgroup.attrs['is_OK'] = True
                wgroup.attrs['is_processed'] = True
                wgroup.attrs['is_continuous'] = True
                wgroup.attrs['is_sample_dataset'] = True
                wgroup.attrs['parent'] = np.string_(source_gname)
                mh5.copy_meta_over(sgroup, wgroup)
            except Exception as inst:
                printlog('%s. %s: Failed to be corrected' % (dataindex, datasetid))
                printlog(inst)
                traceback.print_exc()

        # average the per-sample estimates and store the shared processing metadata
        peak_width = peak_width / i
        sizesp = np.array([len(crt), len(cmz), dataset_count])

        printlog('Estimated min rt peak width: %s sec or %.2f min' % (peak_width, peak_width / 60.0))

        mh5.save_dataset(h5file, self.params['h5writepath'] + 'peak_width', data=peak_width)
        mh5.save_dataset(h5file, self.params['h5writepath'] + 'mzrange', data=self.mzrange)
        mh5.save_dataset(h5file, self.params['h5writepath'] + 'rtrange', data=self.rtrange)
        mh5.save_dataset(h5file, self.params['h5writepath'] + 'sizesp', data=sizesp)
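# Standalone sketch of the per-sample peak-width heuristic used in bin_h5 above: the estimate
# is max(median(width)/3, min width among the lowest-intensity tenth of thresholded peaks),
# later averaged across samples. 'intensities' and 'widths' stand for the per-peak apex
# intensities and widths returned by the peak finder; the median-based threshold is an
# assumption standing in for the median_threshold helper defined elsewhere.
def _example_estimate_peak_width(intensities, widths):
    import numpy as np
    intensities = np.asarray(intensities, dtype=float)
    widths = np.asarray(widths, dtype=float)
    # keep peaks above a median-based intensity threshold (assumed form of median_threshold)
    mask = intensities >= np.median(intensities)
    kept_widths = widths[mask]
    if kept_widths.size < 2:
        return None                                   # too few peaks for a reliable estimate
    # widths ordered by increasing apex intensity of the kept peaks
    sorted_widths = kept_widths[np.argsort(intensities[mask])]
    slice_count = int(sorted_widths.shape[0] / 10)
    quant = np.min(sorted_widths[:slice_count]) if slice_count > 0 else 0.0
    med = np.median(kept_widths) / 3.0
    # per-sample estimate; bin_h5 accumulates and averages this value over all samples
    return max(med, quant)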