def get_average_profile(dbfilepath, dataid=''):
    """Computes the mean intensity profile across all datasets in the h5 database file."""
    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[])
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, 'sizesp')
    sp_mean = np.zeros((sizesp[0], sizesp[1]))
    crt = mh5.load_dataset(dbfilepath, 'crt')
    #crt = crt / 60

    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, datasetid + dataid)
            sp_mean = sp_mean + sp
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ': Failed to read in')
            printlog(inst)

    # keep only the datasets that were successfully read and average over them
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    sp_mean = sp_mean / len(dataidx)

    return sp_mean, crt, datasets
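# Hedged usage sketch (not part of the original module): how get_average_profile
# might be called. The database file name and the '/sp' sub-path passed as dataid
# are hypothetical placeholders.
def _example_get_average_profile():
    result = get_average_profile('example_experiment.h5', dataid='/sp')
    if result is not None:
        sp_mean, crt, datasets = result
        printlog('Averaged %s datasets over %s retention-time points'
                 % (len(datasets), len(crt)))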
def do_mzalignment(dbfilepath, method='binning',
                   params={'binshift': 0.3, 'binsize': 1, 'units': 'Da', 'h5writepath': '/proc'},
                   istrain=1):
    """
    Performs intra-sample correction of molecular m/z drifts between scans of individual samples.

    Args:
        dbfilepath: a user-specified path to the h5 database file
        method: the method of choice for intra-sample m/z drift correction ('binning' by default)
        params: dictionary of parameter arguments for the correction method
                (e.g. ``{'binshift': 0.3, 'binsize': 1, 'units': 'Da'}`` for binning)
        istrain: 1 to fit and save the alignment object, 0 to re-use a previously saved one

    The processed intensity matrices are saved back into the h5 database file.
    """
    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5='/raw')
    if not dataset_names:
        return

    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])

    if istrain == 1:
        with h5py.File(dbfilepath, 'r') as h5file:
            cmzrange = mh5.load_dataset(h5file, '/raw/cmzrange')
            crtrange = mh5.load_dataset(h5file, '/raw/crtrange')

        # delete unnecessary variables and save into hdf5 database file
        if method == 'binning':
            mzAlignObj = Binmz(method, params, cmzrange, crtrange)
            mzAlignObj.save_procobj(dbfilepath, params['h5writepath'])
    elif istrain == 0:
        mzAlignObj = Binmz()
        mzAlignObj.load_procobj(dbfilepath, params['h5writepath'])

    mzAlignObj.bin_h5(dbfilepath, dataset_names)
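# Hedged usage sketch (assumption, not from the original source): running the
# m/z binning step in training mode with its documented default parameters.
# The database file name is a hypothetical placeholder.
def _example_do_mzalignment():
    do_mzalignment('example_experiment.h5',
                   method='binning',
                   params={'binshift': 0.3, 'binsize': 1,
                           'units': 'Da', 'h5writepath': '/proc'},
                   istrain=1)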
def get_data(dbfilepath, h5readpath):
    """
    Extracts data from the h5 file and outputs a dictionary of 'x', 'y', 'id', and 'color'
    entries for each sample.
    """
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    tics = np.zeros((sizesp[0], sizesp[2]))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))

    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ': Failed to read in')
            printlog(inst)
            traceback.print_exc()

    # keep only the datasets that were successfully read
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    tics = tics[:, dataidx]

    nrows, ncols = tics.shape
    sp = {'x': [], 'y': [], 'id': [], 'color': []}
    for i in range(ncols):
        sp['x'].append(crt / 60)
        sp['y'].append(tics[:, i])
        sp['id'].append(datasets[i])
    sp['color'] = colorgenerator(ncols)

    return sp
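# Hedged usage sketch: get_data returns per-sample TIC traces keyed by
# 'x'/'y'/'id'/'color', which can be iterated for plotting or reporting.
# The database file name and the '/spproc2D' group are hypothetical placeholders.
def _example_get_data():
    sp = get_data('example_experiment.h5', '/spproc2D')
    if sp is not None:
        for rt_minutes, tic, sample_id in zip(sp['x'], sp['y'], sp['id']):
            printlog('%s: max TIC %s at %s min'
                     % (sample_id, tic.max(), rt_minutes[int(tic.argmax())]))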
def get_dataset_names(dbfile):
    """
    Extracts dataset names from the hdf5 database file.

    Args:
        dbfile: the path to the database file
    """
    isdbfile = H5BaseMSIWorkflow.checkdbfile(dbfile)
    if isdbfile == 1:
        datasets = mh5.get_dataset_names(dbfile, dataset_names=[])
        if not datasets:
            print("%s database file doesn't contain any MSI datasets" % str(dbfile))
    else:
        datasets = []
    return datasets
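# Hedged usage sketch: listing the datasets stored in a database file.
# The file name is a hypothetical placeholder.
def _example_get_dataset_names():
    for name in get_dataset_names('example_experiment.h5'):
        print(name)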
def get_data(dbfilepath, h5readpath='sp2D'):
    """Extracts per-sample TIC traces from the h5 file as a dictionary of 'x', 'y', 'id', and 'color' entries."""
    # h5 paths always use forward slashes, so prepend one if it is missing
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            print(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        print(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    tics = np.zeros((sizesp[0], sizesp[2]))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))

    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception:
            print(os.path.basename(datasetid) + ': Failed to read in')

    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    tics = tics[:, dataidx]

    nrows, ncols = tics.shape
    sp = {'x': [], 'y': [], 'id': [], 'color': []}
    for i in range(ncols):
        sp['x'].append(crt)
        sp['y'].append(tics[:, i])
        sp['id'].append(datasets[i])
    sp['color'] = colorgenerator(ncols)

    return sp
def do_profile_alignment(dbfilepath, method='rspa',
                         params={'recursion': 1, 'minsegwidth': 100, 'maxpeakshift': 10,
                                 'reference': 'mean', 'h5readpath': '/proc', 'h5writepath': '/proc'},
                         istrain=1):
    """
    Performs advanced adjustment for chromatographic peak position variations at full profile
    resolution using a recursive segment-wise peak alignment strategy.

    Args:
        dbfilepath: the database file path
        method: the choice of peak alignment method. Default value: 'rspa',
                i.e. recursive segment-wise peak alignment.
        params: the dictionary of peak alignment parameters
        istrain: 1 to fit and save the alignment object, 0 to re-use a previously saved one
    """
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])

    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)

    if str(params['minsegwidth']).lower() == 'auto':
        params['minsegwidth'] = peak_width * 10.0
        printlog('Parameter "minsegwidth" is set to %s' % params['minsegwidth'])
    else:
        try:
            params['minsegwidth'] = float(params['minsegwidth'])
        except:
            raise LoggingValueError('Error! %s value for parameter "minsegwidth" cannot be converted to float!'
                                    % params['minsegwidth'])

    if str(params['maxpeakshift']).lower() == 'auto':
        params['maxpeakshift'] = peak_width * 5
        printlog('Parameter "maxpeakshift" is set to %s' % params['maxpeakshift'])
    else:
        try:
            params['maxpeakshift'] = float(params['maxpeakshift'])
        except:
            raise LoggingValueError('Error! %s value for parameter "maxpeakshift" cannot be converted to float!'
                                    % params['maxpeakshift'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if method == 'rspa':
            rtAlObj = RSPA(method, params, rtrange)
    elif istrain == 0:
        rtAlObj = RSPA()
        rtAlObj.load_procobj(dbfilepath, params['h5readpath'])

    rtAlObj.aling_h5(dbfilepath, dataset_names, params['h5readpath'], params['h5writepath'])

    if istrain == 1:
        # save into hdf5 database file
        rtAlObj.export()
        rtAlObj.save_procobj(dbfilepath, params['h5writepath'])
        rtAlObj.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])

    return
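# Hedged usage sketch (assumption): retention-time alignment in training mode with
# 'auto' segment width and peak shift, both derived from the stored peak width
# estimate. The database file name and the h5 group names are hypothetical placeholders.
def _example_do_profile_alignment():
    do_profile_alignment('example_experiment.h5',
                         method='rspa',
                         params={'recursion': 1, 'minsegwidth': 'auto',
                                 'maxpeakshift': 'auto', 'reference': 'mean',
                                 'h5readpath': '/proc', 'h5writepath': '/proc'},
                         istrain=1)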
def do_noisefilter(dbfilepath, smoothmethod='sqfilter', smoothparams={'window': 5, 'degree': 3},
                   baselinemethod='tophat', baselineparams={'frame': 90},
                   params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'}, istrain=1):
    """
    Performs adjustment for high-frequency noise and lower-frequency baseline distortions
    caused by a variety of instrumental and experimental factors.

    Args:
        dbfilepath: a user-specified path to the h5 database file
        smoothmethod: the type of noise filtering method. Default value: 'sqfilter',
                      i.e. the Savitzky-Golay filter.
        smoothparams: the dictionary of parameter arguments for the noise filtering method
        baselinemethod: the type of baseline correction method. Default value: 'tophat',
                        i.e. the top-hat morphological filter.
        baselineparams: the dictionary of parameter arguments for the baseline correction method
        istrain: 1 to fit and save the filter objects, 0 to re-use previously saved ones
    """
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])

    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)

    if str(smoothparams['window']).lower() == 'auto':
        smoothparams['window'] = peak_width * 0.5
        printlog('Parameter "window" is set to %s' % smoothparams['window'])
    else:
        try:
            smoothparams['window'] = float(smoothparams['window'])
        except:
            raise LoggingValueError('Error! %s value for parameter "window" cannot be converted to float!'
                                    % smoothparams['window'])

    if str(baselineparams['frame']).lower() == 'auto':
        baselineparams['frame'] = peak_width * 15
        printlog('Parameter "frame" is set to %s' % baselineparams['frame'])
    else:
        try:
            baselineparams['frame'] = float(baselineparams['frame'])
        except:
            raise LoggingValueError('Error! %s value for parameter "frame" cannot be converted to float!'
                                    % baselineparams['frame'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if smoothmethod != 'none':
            SmoothObject = SmoothFilter(smoothmethod, smoothparams, rtrange)
        else:
            SmoothObject = []

        if baselinemethod != 'none':
            BaselineObject = BaselineFilter(baselinemethod, baselineparams, rtrange)
        else:
            BaselineObject = []

    elif istrain == 0:
        SmoothObject = SmoothFilter()
        SmoothObject.load_procobj(dbfilepath, params['h5readpath'])
        if SmoothObject.method == '':
            SmoothObject = []
        BaselineObject = BaselineFilter()
        BaselineObject.load_procobj(dbfilepath, params['h5readpath'])
        if BaselineObject.method == '':
            BaselineObject = []

    filternoise_h5(dbfilepath, dataset_names, SmoothObject, BaselineObject, params)

    if istrain == 1:
        # save into hdf5 database file
        if SmoothObject:
            SmoothObject.export(rtrange)
            SmoothObject.save_procobj(dbfilepath, params['h5writepath'])
        if BaselineObject:
            BaselineObject.export(rtrange)
            BaselineObject.save_procobj(dbfilepath, params['h5writepath'])
        SmoothObject.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])

    return
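# Hedged usage sketch (assumption): smoothing plus baseline correction with 'auto'
# settings, which derive the filter window and the top-hat frame from the stored
# peak width estimate. The database file name is a hypothetical placeholder.
def _example_do_noisefilter():
    do_noisefilter('example_experiment.h5',
                   smoothmethod='sqfilter',
                   smoothparams={'window': 'auto', 'degree': 3},
                   baselinemethod='tophat',
                   baselineparams={'frame': 'auto'},
                   params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'},
                   istrain=1)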
def export_HTML_list_of_samples_to_file(dbfilepath, h5readpath, output_prefix, plot_width=900,
                                        top_plot_height=300, bottom_plot_height=400,
                                        use_global_maximum=True):
    printlog('Exporting HTML view from [%s]%s to %s...' % (dbfilepath, h5readpath, output_prefix))

    with h5py.File(dbfilepath, 'r') as h5file:

        if not os.path.exists(output_prefix):
            os.makedirs(output_prefix)

        h5readpath = h5Base.correct_h5path(h5readpath)
        dataset_names = mh5.get_dataset_names(h5file, dataset_names=[], pathinh5=h5readpath[:-1])
        if not dataset_names:
            printlog('No datasets found! Nothing to do!')
            return

        dataset_count = len(dataset_names)

        all_rts_set = h5readpath + 'crt'
        if not (all_rts_set in h5file):
            printlog('Error! crt not found in [%s]%s ! Skipping...' % (dbfilepath, h5readpath))
            return
        crt = h5file[all_rts_set]

        all_mzs_set = h5readpath + 'cmz'
        if not (all_mzs_set in h5file):
            printlog('Error! cmz not found in [%s]%s ! Skipping...' % (dbfilepath, h5readpath))
            return
        cmz = h5file[all_mzs_set]

        if not os.path.exists('%s/samples' % output_prefix):
            os.makedirs('%s/samples' % output_prefix)

        with open('%s/index.html' % output_prefix, 'w') as fout:
            fout.write('\n'.join([
                '<!DOCTYPE html>',
                '<html>',
                '<frameset cols="15%,85%">',
                '<frame src="sample_list.html">',
                '<frame src="samples/sample_0.html" name="MSSpectrum">',
                '</frameset>',
                '</html>',
            ]))

        with open('%s/sample_list.html' % output_prefix, 'w') as fout:
            fout.write('\n'.join([
                '<!DOCTYPE html>',
                '<html>',
                '<head>',
                '<title>Samples</title>',
                '</head>',
                '<body>',
                ' <table border=1>',
                ' <tr><th>Samples</th></tr>'
            ]))

            global_max = 0.0
            #global_max_sum = 0.0

            if use_global_maximum:
                printlog('Evaluating global maximum...')
                for i in range(dataset_count):
                    printlog('Sample: %s of %s' % (i + 1, dataset_count))
                    sp_set = h5readpath + dataset_names[i].lstrip('/') + '/sp'
                    if not (sp_set in h5file):
                        printlog('Error! sp not found in %s ! Skipping...'
                                 % (h5readpath + dataset_names[i].lstrip('/')))
                    else:
                        sp = h5file[sp_set]
                        m = np.max(sp)
                        #mm = np.max(np.sum(sp, axis=1))
                        global_max = max(m, global_max)
                        #global_max_sum = max(mm, global_max_sum)

            for i in range(dataset_count):
                printlog('Sample: %s of %s' % (i + 1, dataset_count))
                generate_sample_plot(h5file, h5readpath + dataset_names[i].lstrip('/'), i,
                                     dataset_names[i].lstrip('/'),
                                     output_prefix + '/samples/sample_%s.html' % i,
                                     crt, cmz, plot_width, top_plot_height, bottom_plot_height,
                                     use_global_maximum, global_max)
                fout.write('\n'.join([
                    '<tr><td>%s. ' % (i + 1),
                    '<a href="samples/sample_%s.html" target="MSSpectrum">%s</a>'
                    % (i, dataset_names[i].lstrip('/')),
                    '</td></tr>',
                ]))

            fout.write('\n'.join([
                ' </table>',
                '</body>',
                '</html>',
            ]))

    printlog('Done')
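# Hedged usage sketch: exporting a browsable HTML overview (index, sample list and
# per-sample plots) for all samples under a given h5 group. The file name, group
# and output directory are hypothetical placeholders.
def _example_export_html():
    export_HTML_list_of_samples_to_file('example_experiment.h5', '/spproc2D',
                                        'html_view', plot_width=900,
                                        top_plot_height=300, bottom_plot_height=400,
                                        use_global_maximum=True)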
def do_export(dbfilepath,
              params={'h5readpath': '/spproc2D_peakdetect',
                      'h5fullprofile': '/spproc2D',
                      'output_prefix': '%HDF5_file_name%',
                      'export_ms_peak_list': 'yes',
                      'export_integral_table': 'yes',
                      #'export_full_peak_list': 'no'
                      }):
    """
    Exports processed data to CSV/TXT formats for analysis.

    Args:
        dbfilepath: a user-specified path to the h5 database file
        params: parameters to be used for export:
            h5readpath - path inside HDF5 to the processed dataset
            output_prefix - prefix to be used for the output files
            export_ms_peak_list - whether or not to export MS peak lists for NIST
            export_integral_table - whether or not to export the chromatographic peak integrals
    """
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    params['h5fullprofile'] = h5Base.correct_h5path(params['h5fullprofile'])

    dbfilepath = os.path.abspath(dbfilepath)

    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    #print(dataset_names)
    if not dataset_names:
        printlog('No datasets found in the h5readpath provided: %s !' % params['h5readpath'])
        return

    output_prefix = params['output_prefix']
    if '%HDF5_file_name%' in output_prefix:
        fname = os.path.splitext(os.path.basename(dbfilepath))[0]
        output_prefix = output_prefix.replace('%HDF5_file_name%', fname)

    export_path = params['exportpath']
    if export_path != '':
        export_path = os.path.abspath(export_path)
    else:
        export_path = os.path.split(dbfilepath)[0]

    output_prefix = os.path.join(export_path, output_prefix)

    fpath = os.path.split(output_prefix)[0]
    if not os.path.exists(fpath):
        os.makedirs(fpath)

    if not ('no' in params['export_ms_peak_list']):
        export_ms_peak_list_to_file(dbfilepath, params['h5readpath'], output_prefix,
                                    dataset_names, params['rts'], params['rt_tolerance'])

    if not ('no' in params['export_integral_table']):
        export_integral_table_to_file(dbfilepath, params['h5readpath'], output_prefix,
                                      dataset_names, params['samples'], params['rts'],
                                      params['rt_tolerance'])

    if not ('no' in params['export_metadata_table']):
        export_metadata_table_to_file(dbfilepath, params['h5readpath'], output_prefix,
                                      dataset_names, params['samples'])

    '''if not ('no' in params['export_all_peaks']):
        export_all_peaks_to_file(dbfilepath, params['h5readpath'], output_prefix, dataset_names)
    '''

    return
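# Hedged usage sketch (assumption): do_export also reads 'exportpath', 'rts',
# 'rt_tolerance', 'samples' and 'export_metadata_table' from params even though
# they are absent from its defaults, so a caller presumably supplies them; the
# values below are illustrative placeholders only.
def _example_do_export():
    do_export('example_experiment.h5',
              params={'h5readpath': '/spproc2D_peakdetect',
                      'h5fullprofile': '/spproc2D',
                      'output_prefix': '%HDF5_file_name%',
                      'export_ms_peak_list': 'yes',
                      'export_integral_table': 'yes',
                      'export_metadata_table': 'yes',
                      'exportpath': '',
                      'samples': '',
                      'rts': '',
                      'rt_tolerance': 2.5})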
def get_data(dbfilepath, h5readpath):
    """
    Extracts data from the h5 file and outputs a dictionary of 'x', 'y', 'id', and 'color'
    entries for each sample.
    """
    data = {'x': [], 'y': [], 'id': [], 'color': []}

    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))

    try:
        ref2D = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'ref2D']))
        print(ref2D.shape)
    except Exception:
        ref2D = None

    try:
        histc = mh5.load_dataset(dbfilepath,
                                 '/'.join([h5readpath + '_peakdetect', 'clustering_histogram']))
        print(np.min(histc))
        print(np.max(histc))
    except Exception:
        histc = None

    try:
        histc_threshold = mh5.load_dataset(dbfilepath,
                                           '/'.join([h5readpath + '_peakdetect',
                                                     'clustering_histogram_threshold']))
        print(np.min(histc_threshold))
        print(np.max(histc_threshold))
    except Exception:
        histc_threshold = None

    tics = np.zeros((sizesp[0], sizesp[2]))
    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ': Failed to read in')
            printlog(inst)
            traceback.print_exc()

    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    tics = tics[:, dataidx]

    if not (histc is None):
        #histc[histc <= threshold] = 0
        '''
        if 'histc' in data:
            p.line(x='hx', y='histc', line_width=1, line_color='firebrick', source=source)
        if 'histc_threshold' in data:
            p.line(x='hx', y='histc_threshold', line_width=2, line_color='navy', source=source)
        if 'ref2D' in data:
            p.line(x='refx', y='ref2D', line_width=1, line_color='red', source=source)
        if 'picked_peaks' in data:
            p.circle(x='peak_x', y='picked_peaks', color='peak_color', size=3, source=source)
        '''
        # rescale the clustering histogram so that it is comparable to the TIC traces
        med_int = np.median(tics, axis=1).flatten()[0:-1] + 1
        med_int = np.sqrt(med_int)
        histc = med_int * histc
        mv = np.max(tics.flatten())
        mv = mv / np.max(histc)
        threshold = median_threshold(histc)
        threshold *= mv
        histc *= mv

        # estimate a smoothed, retention-time dependent threshold over windows of 100 points
        dd = int(len(histc) / 100) - 1
        dx = np.zeros((dd,), dtype=np.float64)
        dy = np.zeros((dd,), dtype=np.float64)
        for i in range(dd):
            dx[i] = np.mean(crt[i * 100:i * 100 + 100])
            dy[i] = median_threshold(histc[i * 100:i * 100 + 100])
        dy = smooth1D(dx, dy, 10)
        dx[0] = np.min(crt)
        dx[-1] = np.max(crt)
        fit = interp1d(dx, dy, kind='cubic')
        fitted_threshold = fit(crt)

    nrows, ncols = tics.shape
    for i in range(ncols):
        data['x'].append(crt / 60)
        data['y'].append(tics[:, i])
        data['id'].append(datasets[i])
    data['color'] = colorgenerator(ncols)

    return data