def __init__(self, filereadinfo, dbname='', dbpath='', time_mult=1.0):
    """ Class parameter initialization """
    self.dbname = ''
    self.dbpath = ''
    self.time_mult = time_mult
    if dbname != '':
        self.dbpath = os.path.dirname(dbname)
        self.dbname = os.path.basename(dbname)
    if self.dbpath == '':
        if os.path.isdir(dbpath):
            self.dbpath = dbpath
        else:
            self.dbpath = os.getcwd()
    try:
        if not os.path.isdir(self.dbpath):
            os.makedirs(self.dbpath)
    except Exception:
        # Fall back to the current working directory if the path cannot be created.
        self.dbpath = os.getcwd()
    if self.dbname == '':
        printlog('Output database file name not provided. Setting output file name to:')
        self.dbname = "msdata__" + time.strftime("%H%M_%d_%m_%Y") + ".h5"
        printlog(self.dbname)
    else:
        self.dbname = os.path.splitext(self.dbname)[0] + '.h5'
def recursive_copy_group_contents(source, target, overwrite=True):
    """ Recursively copy group contents considering overwrite setting """
    for attribute in source.attrs.keys():
        if attribute not in target.attrs or overwrite:
            target.attrs[attribute] = source.attrs[attribute]
    for key in source.keys():
        item = source[key]
        if isinstance(item, h5py.Group):
            if key in target:
                oitem = target[key]
                if not isinstance(oitem, h5py.Group):
                    raise H5FileError(
                        'Error! Cannot copy group %s from %s into existing dataset %s in %s!'
                        % (key, source.name, key, target.name))
            else:
                oitem = target.create_group(key)
            recursive_copy_group_contents(item, oitem, overwrite)
        elif isinstance(item, h5py.Dataset):
            if (key not in target) or overwrite:
                if key in target:
                    del target[key]
                source.copy(item, target, name=key)
        else:
            printlog('Unsupported item %s of type %s ignored!' % (key, type(item)))
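# Hedged usage sketch (not part of the original module): merging one HDF5 group
# into another with recursive_copy_group_contents(). The file and group names
# here are hypothetical; h5py is assumed to be imported at module level.
def _demo_recursive_copy(h5filename='demo_copy.h5'):
    with h5py.File(h5filename, 'a') as h5:
        src = h5.require_group('/run1')
        src.attrs['operator'] = 'demo'
        if 'sp' not in src:
            src.create_dataset('sp', data=[1.0, 2.0, 3.0])
        dst = h5.require_group('/run1_backup')
        # overwrite=True refreshes attributes and datasets already present in dst
        recursive_copy_group_contents(src, dst, overwrite=True)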
def toc(id_tic='default', printing=True):
    """
    Returns and prints (optionally) the number of seconds that passed since
    the corresponding tic(id) was called.

    Args:
        id_tic: string or number identifying the corresponding starting point.
            Default: 'default'.
        printing: boolean, determines if the number of elapsed seconds is printed.
            Default: True

    Returns:
        Number of seconds passed since the corresponding tic(id) was called, as float.
    """
    global __dttime
    try:
        t = time.time() - __dttime[id_tic]
    except Exception:
        printlog('tic "%s" not found! Did you forget to call tic(%s)?' % (id_tic, id_tic))
        return 0.0
    if printing:
        printlog('%s seconds' % t)
    return t
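# Hedged usage sketch (not part of the original module): pairing toc() with the
# tic() helper it references. Assumes tic(id) records time.time() in the
# module-level __dttime dictionary under the given id.
def _demo_tic_toc():
    tic('load')            # start a named timer (assumed helper)
    time.sleep(0.1)        # ...the work being timed...
    elapsed = toc('load')  # prints elapsed seconds and returns them as float
    return elapsed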
def load_dataset(dbfilepath, pathinh5):
    pathinh5 = re.sub('//', '/', pathinh5)
    dataset = []
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return dataset
    try:
        isdata = pathinh5 in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return dataset
    if isdata:
        dataset = h5file_group[pathinh5][()]
    if isdbfile == 1:
        h5file_group.close()
    return dataset
def cp(self, source, target):
    """ Copy source to target path """
    target = abs_hdf5_path(target, self.h5path).rstrip('/')
    if target not in self.h5file:
        target_group = self.h5file.create_group(target)
    else:
        target_group = self.h5file[target]
    if not isinstance(target_group, h5py.Group):
        printlog('Error! Destination %s exists and it is not a group! Cannot copy %s!'
                 % (target, source))
        return
    source_path = abs_hdf5_path(source, self.h5path)
    selected_items = get_items_list_from_hdf5(self.h5file, source_path, recursive=True)
    source_path = source_path.split('/')[:-1]
    source_path_len = len(source_path)
    for item in selected_items:
        printlog('Copying %s to %s...' % (item, target))
        rel_item = '/'.join(item.split('/')[source_path_len:])
        copy_hdf5_item(self.h5file, item, target + '/' + rel_item)
def abs_hdf5_path(h5path, reference_path='/'):
    """ Make HDF5 relative path absolute """
    if not reference_path.startswith('/'):
        raise OperationError('Reference path should be absolute path!')
    if not h5path.startswith('/'):
        h5path = reference_path.rstrip('/') + '/' + h5path
    end_slash = h5path.endswith('/')
    newpath = []
    for part in h5path.split('/'):
        if part == '..':
            if len(newpath) > 0:
                del newpath[-1]
            else:
                printlog('Error! Cannot go above root with ".." in path! Staying at root!')
        elif part != '.' and part != '':
            newpath.append(part)
    result = '/%s' % ('/'.join(newpath))
    if end_slash and not result.endswith('/'):
        result += '/'
    return result
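# Illustrative expectations for abs_hdf5_path(), derived from the code above
# (a sketch, not part of the original module):
def _demo_abs_hdf5_path():
    assert abs_hdf5_path('sp', '/proc') == '/proc/sp'                         # relative -> absolute
    assert abs_hdf5_path('../raw/sp', '/proc/sample1') == '/proc/raw/sp'      # '..' resolved
    assert abs_hdf5_path('./sp2D/', '/') == '/sp2D/'                          # '.' dropped, trailing slash kept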
def generate_sample_plot(h5file, dataset_path, dataset_index, dataset_name,
                         output_file, crt, cmz, plot_width, top_plot_height,
                         bottom_plot_height, use_global_max, global_max):
    sp_set = dataset_path + '/sp'
    if sp_set not in h5file:
        printlog('Error! sp not found in %s ! Skipping...' % (dataset_path))
        return
    sp = np.array(h5file[sp_set])
    #sp_int = np.sum(sp, axis = 1)
    crt = np.array(crt)
    crt = crt / 60.0
    top_plot = _make_top_rt_plot(
        crt, sp, '%s. RT Integral profile, linear scale' % dataset_name,
        plot_width, top_plot_height, use_global_max, global_max)
    bottom_plot = _make_bottom_rt_plot(
        crt, cmz, sp, '%s. Full profile, log scale' % dataset_name,
        plot_width, bottom_plot_height, True, use_global_max, global_max)
    #bottom_plot2 = _make_bottom_rt_plot(crt, cmz, sp, '%s. Full profile, linear scale' % dataset_name,
    #                                    plot_width, bottom_plot_height, False, use_global_max, global_max)
    bottom_plot.x_range = top_plot.x_range
    #bottom_plot2.x_range = top_plot.x_range
    #bottom_plot2.y_range = bottom_plot.y_range
    script, div = components(gridplot([
        [top_plot],
        [bottom_plot],
        #[bottom_plot2],
    ]))
    with open(output_file, 'w') as fspec:
        fspec.write('\n'.join([
            '<!DOCTYPE html>',
            '<html lang="en">',
            ' <head>',
            ' <meta charset="utf-8">',
            ' <title>RT Spectrum for all peaks</title>',
            CDN.render(),
            ' <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">',
            ' </head>',
            ' <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js"></script>',
            ' <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>',
            ' <body>',
            div,
            ' </body>',
            script,
            '<style>',
            '</style>',
            ' </html>',
        ]))
def view_tics(dbfilepath, method='', params=''):
    printlog('\nExtracting TICs.....')
    h5readpath = params['h5readpath']
    data = get_data(dbfilepath, h5readpath)
    figtitle = 'Total Ion Chromatograms'
    dirname = os.path.dirname(dbfilepath)
    if params['outputfile'] == '':
        filename = 'bp' + time.strftime("%H%M_%d_%m_%Y") + '.html'
    else:
        filename = params['outputfile']
    output_file(os.path.join(dirname, filename))
    source = ColumnDataSource(data)
    p = figure(width=params['plot_width'], height=params['plot_height'],
               title=figtitle, x_axis_label='Time(mins)', y_axis_label='Intensity')
    p.multi_line(xs='x', ys='y', line_width=1, line_color='color', line_alpha=0.6,
                 hover_line_color='color', hover_line_alpha=1.0, source=source)
    print(data.keys())  # debug output of available data columns
    # Optional overlays, drawn only if present in the extracted data:
    if 'histc' in data:
        p.line(x='hx', y='histc', line_width=1, line_color='firebrick', source=source)
    if 'histc_threshold' in data:
        p.line(x='hx', y='histc_threshold', line_width=2, line_color='navy', source=source)
    if 'ref2D' in data:
        p.line(x='refx', y='ref2D', line_width=1, line_color='red', source=source)
    if 'picked_peaks' in data:
        p.circle(x='peak_x', y='picked_peaks', color='peak_color', size=3, source=source)
    p.add_tools(HoverTool(show_arrow=False, line_policy='next', tooltips=[('id', '@id')]))
    try:
        p.toolbar.active_inspect = [None]
    except Exception:
        pass
    if params['display'] == 'yes':
        show(p)
    else:
        save(p)
def mkdir(self, target):
    """ Make group defined by target """
    target = abs_hdf5_path(target, self.h5path).rstrip('/')
    printlog('Creating group %s' % (target))
    if target in self.h5file:
        printlog('Error! %s already exists!' % target)
    else:
        self.h5file.create_group(target)
def save_dataset(dbfilepath, pathinh5, data, chunksize='', compression_opts=''):
    pathinh5 = re.sub('//', '/', pathinh5)
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return
    try:
        isdata = pathinh5 in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return
    if isdata:
        fdata = h5file_group[pathinh5]
        if (fdata.shape == data.shape) and (fdata.dtype == data.dtype):
            # In-place update if shape and dtype match.
            fdata[...] = data
            if isdbfile == 1:
                # Close the file we opened here before the early return.
                h5file_group.close()
            return
        else:
            printlog('Deleting original')
            del h5file_group[pathinh5]
    if (not chunksize) and (not compression_opts):
        h5file_group.create_dataset(pathinh5, data=data)
    elif chunksize and compression_opts:
        h5file_group.create_dataset(pathinh5, data=data, chunks=chunksize,
                                    compression="gzip",
                                    compression_opts=compression_opts)
    elif chunksize:
        h5file_group.create_dataset(pathinh5, data=data, chunks=chunksize)
    elif compression_opts:
        h5file_group.create_dataset(pathinh5, data=data, chunks=True,
                                    compression="gzip",
                                    compression_opts=compression_opts)
    if isdbfile == 1:
        h5file_group.close()
    return
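# Hedged round-trip sketch (not part of the original module) for save_dataset()
# and load_dataset() above. Note that when given a string path, save_dataset()
# requires the file to exist already, so an open h5py.File is used instead.
def _demo_save_load_roundtrip(dbfilepath='demo.h5'):
    data = np.arange(12, dtype=np.float64).reshape(3, 4)
    with h5py.File(dbfilepath, 'a') as h5file:
        save_dataset(h5file, '/demo/sp', data)                         # plain storage
        save_dataset(h5file, '/demo/sp_gz', data, compression_opts=5)  # gzip, auto chunks
        back = load_dataset(h5file, '/demo/sp')
        assert np.array_equal(back, data)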
def load_preproc_obj(dbfilepath, procid, pathinh5=''):
    """
    **Loads the pre-processing parameters of a module from the hdf5 database.**

    Args:
        dbfilepath: the name and path to the hdf5-database file
        procid: the module identifier
        pathinh5: the path in the hdf5 file for object storage
    """
    h5objpath = pathinh5 + procid
    h5objpath = re.sub('//', '/', h5objpath)
    ProcObj = {}
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return ProcObj
    try:
        isobj = h5objpath in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return ProcObj
    if not isobj:
        return ProcObj
    # check whether this object is part of the preprocessing workflow
    h5obj = h5file_group[h5objpath]
    for i_name in h5obj.keys():
        if isinstance(h5obj[i_name], h5py.Group):
            h5subobj = h5obj[i_name]
            subProcObj = {}
            for j_name in h5subobj.keys():
                subProcObj[j_name] = load_dataset(h5subobj, j_name)
            ProcObj[i_name] = subProcObj
        else:
            ProcObj[i_name] = load_dataset(h5obj, i_name)
    if isdbfile == 1:
        h5file_group.close()
    return ProcObj
def rm(self, target):
    """ Remove items defined by target """
    printlog('Removing %s...' % (target))
    # Reverse the item list so that children are deleted before their parents.
    targets = reversed(
        get_items_list_from_hdf5(self.h5file,
                                 abs_hdf5_path(target, self.h5path),
                                 recursive=True))
    for i in targets:
        del self.h5file[i]
def __init__(self, stat_model, methodparams):
    """
    ANOVA method initialization.

    Args:
        stat_model: instance of the StatisticalModel class
        methodparams: dictionary of supplied method parameters
    """
    self.stat_model = stat_model
    self.methodparams = methodparams
    printlog('\nInitialized ANOVA analyser...')
    self.p_value_threshold = self.methodparams['p_value']
    printlog('\np-value threshold: %.5f' % self.p_value_threshold)
    #if self.methodparams['interactions_expected'] == 'no':
    #    self.anova_type = 2
    #    printlog('Interactions not expected. Using ANOVA type 2...')
    #else:
    #    self.anova_type = 3
    #    printlog('Interactions expected. Using ANOVA type 3...')
    if self.methodparams['robust'] == 'None':
        self.robust = None
        printlog('Robust covariance not used...')
    else:
        self.robust = self.methodparams['robust']
        printlog('Using covariance %s...' % self.robust)
def get_refsp_h5(self, dbfilepath, datasets, h5readpath):
    with h5py.File(dbfilepath, 'r') as h5file:
        i = 0
        printlog("\nPreparing reference profile for inter-sample retention time drift "
                 "alignment across %s datasets from \n%s...\n" % (len(datasets), dbfilepath))
        dataindex = 0
        for datasetid in datasets:
            dataindex = dataindex + 1
            try:
                sp2D = mh5.load_dataset(h5file, h5readpath[:-1] + datasetid + '/sp')
                if i == 0:
                    i = i + 1
                    ref2D = sp2D
                    continue
                if self.reference == 'mean':
                    ref2D = (sp2D + ref2D)
                    i = i + 1
                printlog('%s. %s: Successfully updated from -> %s%s'
                         % (dataindex, datasetid, os.path.basename(dbfilepath), h5readpath))
            except Exception as inst:
                printlog('%s. %s: Failed' % (dataindex, datasetid))
                printlog(inst)
                traceback.print_exc()
    if self.reference == 'mean':
        self.ref2D = ref2D / i
def get_average_profile(dbfilepath, dataid=''):
    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[])
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return
    sizesp = mh5.load_dataset(dbfilepath, 'sizesp')
    sp_mean = np.zeros((sizesp[0], sizesp[1]))
    crt = mh5.load_dataset(dbfilepath, 'crt')
    #crt = crt / 60
    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, datasetid + dataid)
            sp_mean = sp_mean + sp
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ": Failed to read in")
            printlog(inst)
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    sp_mean = sp_mean / len(dataidx)
    return sp_mean, crt, datasets
def load_mgf_fragments(fragfilename):
    rt_peaks = []
    ms_fragments = []
    reading_peaks = False
    in_record = False
    current_ms_fragments = []
    current_rt = 0.0
    with open(fragfilename, 'r') as finp:
        for s in finp:
            s = s.rstrip('\n')
            if in_record:
                if reading_peaks:
                    if s.startswith('END'):
                        # Close the current record.
                        reading_peaks = False
                        in_record = False
                        ms_fragments.append(current_ms_fragments)
                        rt_peaks.append(current_rt)
                        current_ms_fragments = []
                    else:
                        s = s.split(' ')
                        current_ms_fragments[0].append(float(s[0]))
                        current_ms_fragments[1].append(float(s[1]))
                elif s.startswith('RTINSECONDS='):
                    current_rt = float(s.split('=')[1]) / 60.0
                elif '=' in s:
                    # Skip other header fields such as TITLE= or PEPMASS=.
                    pass
                elif ' ' in s:
                    # First peak line of the record: start collecting m/z and intensity lists.
                    s = s.split(' ')
                    try:
                        mz = float(s[0])
                        intens = float(s[1])
                        reading_peaks = True
                        current_ms_fragments.append([mz])
                        current_ms_fragments.append([intens])
                    except Exception:
                        printlog('%s does not contain proper float pair! Ignoring...' % s)
            elif s.startswith('BEGIN IONS'):
                in_record = True
    for i in range(len(ms_fragments)):
        ms_fragments[i] = np.array(ms_fragments[i], dtype=np.float64)
    return np.array(rt_peaks, dtype=np.float64), ms_fragments
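# Hedged usage sketch (not part of the original module). load_mgf_fragments()
# expects records of the form:
#
#   BEGIN IONS
#   TITLE=example
#   RTINSECONDS=65.4
#   100.1 2345.0
#   101.2 678.0
#   END IONS
#
# and returns retention times in minutes plus, per record, a 2 x N float array
# with m/z values in row 0 and intensities in row 1. The file name is hypothetical.
def _demo_load_mgf(fragfilename='demo.mgf'):
    rt_peaks, ms_fragments = load_mgf_fragments(fragfilename)
    for rt, frag in zip(rt_peaks, ms_fragments):
        printlog('RT %.2f min: %s fragments' % (rt, frag.shape[1]))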
def write_meta_values(h5, h5writepath, sampleid, metids, metvals, overwrite=True):
    """
    Write meta values to the open h5 file according to the sample id.

    Args:
        h5: HDF5 File instance (opened and writeable).
        h5writepath: string containing path to the group of samples to add metadata to.
        sampleid: name of the sample, same as the name of the group containing
            sample data. Matching is case sensitive.
        metids: list of metadata variable names
        metvals: list of metadata variable values
        overwrite: boolean, defines if already existing metadata values will be
            overwritten.

    Returns:
        None
    """
    group_name = h5writepath + sampleid
    if group_name in h5:
        printlog('Adding metadata to %s...' % group_name)
        group = h5[group_name]
        if 'MetaData' in group:
            metagroup = group['MetaData']
            if not isinstance(metagroup, h5py.Group):
                raise HDF5FormatError('Error! A dataset "MetaData" exists in %s of %s!'
                                      % (group_name, h5.filename))
        else:
            metagroup = group.create_group('MetaData')
        group.attrs['has_metadata'] = True
        for i in range(len(metids)):
            metid = str(metids[i])
            if metid != '':
                if (metid not in metagroup.attrs) or overwrite:
                    metagroup.attrs[metid] = trydecode(metvals[i])
    else:
        printlog('%s not found in %s! Skipping...' % (group_name, h5.filename))
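# Hedged usage sketch (not part of the original module): attaching two metadata
# values to an existing sample group. File and group names are hypothetical;
# the trydecode() helper referenced above is assumed to be available.
def _demo_write_meta(h5name='demo_meta.h5'):
    with h5py.File(h5name, 'a') as h5:
        h5.require_group('/proc/sample_01')  # sample group must already exist
        write_meta_values(h5, '/proc/', 'sample_01',
                          ['dose', 'time'], ['10', '24h'], overwrite=True)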
def get_traindata_names(dbfilepath, dbroot='', dataset_names=[], istrain=1):
    """ Recursively extracts dataset names from hdf5 database """
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        item = dbfilepath
        isdbfile = 0
    else:
        return dataset_names
    for key, val in iteritem(dict(item)):
        try:
            subitem = dict(val)
            if ('istrain' in subitem) and ('Sp' in subitem):
                if load_dataset(item, val.name + '/istrain') == istrain:
                    success = 1
                else:
                    success = 0
            else:
                success = 0
        except Exception as inst:
            printlog(inst)
            traceback.print_exc()
            success = 0
        if success == 1:
            dataset_names.append(val.name)
        elif isinstance(val, h5py.Group):
            dbroot = dbroot + val.name
            dataset_names = get_traindata_names(val, dbroot, dataset_names, istrain)
    if isdbfile == 1:
        h5file.close()
    return dataset_names
def aling_h5(self, dbfilepath, datasets, h5readpath, h5writepath):
    #if not self.ref2D:
    printlog("\nPerforming internal sample retention time profile alignment "
             "across %s datasets from \n%s...\n" % (len(datasets), dbfilepath))
    dataindex = 0
    with h5py.File(dbfilepath, 'a') as h5file:
        #mh5.save_dataset(h5file, h5writepath + '/ref2D', data=self.ref2D, compression_opts=5)
        for datasetid in datasets:
            dataindex = dataindex + 1
            try:
                sp2D = mh5.load_dataset(h5file, h5readpath[:-1] + datasetid + '/sp')
                nrt, nmz = sp2D.shape
                ref2D = np.mean(sp2D, axis=1)
                mh5.save_dataset(h5file, h5writepath + datasetid.lstrip('/') + '/ref2D',
                                 data=ref2D, compression_opts=5)
                for i in range(nmz):
                    alprof = self.align(sp2D[:, i], ref2D[:])
                    sp2D[:, i] = alprof
                mh5.save_dataset(h5file, h5writepath[:-1] + datasetid + '/sp',
                                 data=sp2D, compression_opts=5)
                printlog('%s. %s: Successfully aligned and deposited -> %s%s'
                         % (dataindex, datasetid, os.path.basename(dbfilepath), h5writepath))
                target_gname = h5writepath[:-1] + datasetid
                source_gname = h5readpath[:-1] + datasetid
                wgroup = h5file[target_gname]
                sgroup = h5file[source_gname]
                wgroup.attrs['is_raw'] = False
                wgroup.attrs['is_OK'] = True
                wgroup.attrs['is_processed'] = True
                wgroup.attrs['is_continuous'] = True
                wgroup.attrs['is_sample_dataset'] = True
                wgroup.attrs['parent'] = np.string_(source_gname)
                mh5.copy_meta_over(sgroup, wgroup)
            except Exception as inst:
                printlog('%s. %s: Failed to be deposited' % (dataindex, datasetid))
                printlog(inst)
                traceback.print_exc()
def ls(self, recursive=False, selection='*', show_attributes=False,
       show_groups=True, show_datasets=True):
    source_path = abs_hdf5_path(selection, self.h5path)
    selected_items = get_items_list_from_hdf5(self.h5file, source_path, recursive)
    source_path = source_path.split('/')[:-1]
    source_path_len = len(source_path)
    for item_name in selected_items:
        item = self.h5file[item_name]
        item_relpath = '/'.join(item_name.split('/')[source_path_len:])
        if isinstance(item, h5py.Group) and show_groups:
            printlog('[%s]' % item_relpath)
            if show_attributes:
                print_attributes(item)
        elif isinstance(item, h5py.Dataset) and show_datasets:
            printlog(' %s ' % item_relpath)
            if show_attributes:
                print_attributes(item)
        else:
            printlog('?%s ' % item_relpath)
def __init__(self, filereadinfo, dbname='', dbpath='', time_mult=1.0):
    """ Class parameter initialization """
    self.dbname = ''
    self.dbpath = ''
    self.time_mult = time_mult
    # the keys used to retrieve certain data from the NetCDF file
    self.__mass_string = filereadinfo['massid']
    self.__intensity_string = filereadinfo['specid']
    self.__scan_string = filereadinfo['scanid']
    self.__time_string = filereadinfo['timeid']
    if dbname != '':
        self.dbpath = os.path.dirname(dbname)
        self.dbname = os.path.basename(dbname)
    if self.dbpath == '':
        if os.path.isdir(dbpath):
            self.dbpath = dbpath
        else:
            self.dbpath = os.getcwd()
    try:
        if not os.path.isdir(self.dbpath):
            os.makedirs(self.dbpath)
    except Exception:
        # Fall back to the current working directory if the path cannot be created.
        self.dbpath = os.getcwd()
    if self.dbname == '':
        printlog('Output database file name not provided. Setting output file name to:')
        self.dbname = "msdata__" + time.strftime("%H%M_%d_%m_%Y") + ".h5"
        printlog(self.dbname)
    else:
        self.dbname = os.path.splitext(self.dbname)[0] + '.h5'
def print_structure_h5db(dbfilepath, dbroot='', offset='    '):
    """ Prints the HDF5 database structure """
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group, h5py.Dataset)):
        item = dbfilepath
        isdbfile = 0
    else:
        return
    if isinstance(item, h5py.File):
        printlog('%s (File) %s' % (item.file, item.name))
    elif isinstance(item, h5py.Dataset):
        printlog('(Dataset) %s len = %s' % (item.name, item.shape))  # item.dtype
    elif isinstance(item, h5py.Group):
        printlog('(Group) %s' % item.name)
    else:
        printlog('Warning: The item type is unknown %s' % item.name)
        sys.exit("execution is terminated")
    if isinstance(item, h5py.File) or isinstance(item, h5py.Group):
        for key, val in dict(item).items():
            subitem = val
            printlog('%s%s' % (offset, key))
            dbroot = dbroot + 'i'
            print_structure_h5db(subitem, dbroot=dbroot, offset=offset + '    ')
    if isdbfile == 1:
        h5file.close()
def get_data(dbfilepath, h5readpath):
    """ Extract data from the h5file and output as a dictionary of 'x', 'y',
    'id', and 'color' for each sample """
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])
    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return
    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    tics = np.zeros((sizesp[0], sizesp[2]))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))
    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ": Failed to read in")
            printlog(inst)
            traceback.print_exc()
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    tics = tics[:, dataidx]
    nrows, ncols = tics.shape
    sp = {'x': [], 'y': [], 'id': [], 'color': []}
    for i in range(ncols):
        sp['x'].append(crt / 60)
        sp['y'].append(tics[:, i])
        sp['id'].append(datasets[i])
    sp['color'] = colorgenerator(ncols)
    return sp
def cd(self, h5path):
    h5path = abs_hdf5_path(h5path, self.h5path)
    if h5path in self.h5file:
        item = self.h5file[h5path]
        if isinstance(item, h5py.Group):
            self.current_group = item
            self.h5path = h5path
            printlog('Current path set to %s' % h5path)
        else:
            printlog('Error! Path %s does not point to a group!' % h5path)
    else:
        printlog('Error! Path %s does not exist!' % h5path)
def export_metadata_table_to_file(dbfilepath, h5readpath, output_prefix,
                                  dataset_names, samples):
    printlog('Exporting associated metadata...')
    metacolumns = set()
    if '*' not in samples:
        samples = set(samples)
        dn = []
        for dataset_name in dataset_names:
            if dataset_name.lstrip('/') in samples:
                dn.append(dataset_name)
        if not dn:
            printlog('No samples found matching the provided sample list!')
            return
        dataset_names = dn
    with h5py.File(dbfilepath, 'r') as h5file:
        # First pass: collect the union of all metadata attribute names.
        for dataset_name in dataset_names:
            group_name = h5readpath[:-1] + dataset_name
            group = h5file[group_name]
            if 'MetaData' in group:
                metagroup = group['MetaData']
                for attribute in metagroup.attrs.keys():
                    metacolumns.add(attribute)
        metacolumns = sorted(list(metacolumns))
        if metacolumns:
            # Second pass: write one CSV row per sample.
            with open('%s_metadata.csv' % output_prefix, 'w') as fout:
                fout.write('Sample')
                for s in metacolumns:
                    fout.write(',"%s"' % s)
                fout.write('\n')
                for dataset_name in dataset_names:
                    group_name = h5readpath[:-1] + dataset_name
                    group = h5file[group_name]
                    fout.write('"%s"' % dataset_name.lstrip('/'))
                    if 'MetaData' in group:
                        metagroup = group['MetaData']
                        for attribute in metacolumns:
                            if attribute in metagroup.attrs:
                                value = metagroup.attrs[attribute]
                            else:
                                value = ''
                            fout.write(',"%s"' % value)
                    fout.write('\n')
    printlog('Done')
                metids = row[1:]
            else:
                sampleid = row[0]
                metvals = row[1:]
                minlen = min(len(metids), len(metvals))
                write_meta_values(h5, h5writepath, str(sampleid),
                                  metids[0:minlen], metvals[0:minlen],
                                  parameters['overwrite'] == 'yes')


if __name__ == "__main__":
    tic()
    settings = OptionsHolder(__doc__, ImportMetaSet_options)
    settings.description = 'Metadata import'
    settings.do = 'yes'
    printlog(settings.program_description)
    # Parse command line parameters
    try:
        settings.parse_command_line_args()
    except Exception as inst:
        printlog('!!! Error in command line parameters: !!!')
        printlog(inst)
        printlog('\nRun python ' + sys.argv[0] + ' --help for command line options information!')
        sys.exit(-1)

    parameters = settings.parameters
    if parameters['logfile'] != '':
        start_log(parameters['logfile'],
                  overwrite_existing=(parameters['overwrite_logfile'] == 'yes'),
def do_profile_alignment(dbfilepath, method='rspa',
                         params={'recursion': 1,
                                 'minsegwidth': 100,
                                 'maxpeakshift': 10,
                                 'reference': 'mean',
                                 'h5readpath': '/proc',
                                 'h5writepath': '/proc'},
                         istrain=1):
    """
    Performs advanced adjustment for chromatographic peak position variations
    at full profile resolution using a recursive segment-wise peak alignment
    strategy.

    Args:
        dbfilepath: The database file path
        method: The choice of peak alignment method. Default value: 'rspa',
            i.e. recursive segment-wise peak alignment.
        params: The dictionary of peak alignment parameters
    """
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return
    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)
    if str(params['minsegwidth']).lower() == 'auto':
        params['minsegwidth'] = peak_width * 10.0
        printlog('Parameter "minsegwidth" is set to %s' % params['minsegwidth'])
    else:
        try:
            params['minsegwidth'] = float(params['minsegwidth'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "minsegwidth" cannot be converted to float!'
                                    % params['minsegwidth'])
    if str(params['maxpeakshift']).lower() == 'auto':
        params['maxpeakshift'] = peak_width * 5
        printlog('Parameter "maxpeakshift" is set to %s' % params['maxpeakshift'])
    else:
        try:
            params['maxpeakshift'] = float(params['maxpeakshift'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "maxpeakshift" cannot be converted to float!'
                                    % params['maxpeakshift'])
    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if method == 'rspa':
            rtAlObj = RSPA(method, params, rtrange)
    elif istrain == 0:
        rtAlObj = RSPA()
        rtAlObj.load_procobj(dbfilepath, params['h5readpath'])
    rtAlObj.aling_h5(dbfilepath, dataset_names, params['h5readpath'], params['h5writepath'])
    if istrain == 1:
        # save into hdf5 database file
        rtAlObj.export()
        rtAlObj.save_procobj(dbfilepath, params['h5writepath'])
        rtAlObj.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])
    return
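# Hedged invocation sketch (not part of the original module), using the default
# parameter values documented above; 'auto' derives minsegwidth/maxpeakshift
# from the stored peak_width estimate. The database file name is hypothetical.
#
# do_profile_alignment('msdata.h5', method='rspa',
#                      params={'recursion': 1, 'minsegwidth': 'auto',
#                              'maxpeakshift': 'auto', 'reference': 'mean',
#                              'h5readpath': '/proc', 'h5writepath': '/proc'},
#                      istrain=1)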
def view_tics(dbfilepath, method='', params=''):
    printlog('\nExtracting TICs.....')
    h5readpath = params['h5readpath']
    sp = get_data(dbfilepath, h5readpath)
    figtitle = 'Total Ion Chromatograms'
    dirname = os.path.dirname(dbfilepath)
    #screenwidth = 800
    #screenheight = 600
    # if method == 'pyplot':
    #     mydpi = 96
    #     plt.figure(figsize=(screenwidth / mydpi, screenheight / mydpi), dpi=mydpi)
    #     plt.plot(crt, tics)
    #     plt.xlabel('Time(s)')
    #     plt.ylabel('Intensity')
    #     plt.title(figtitle)
    #     plt.show()
    if method == 'bokeh':
        from bokeh.plotting import figure, show, save, output_notebook, output_file, ColumnDataSource
        from bokeh.io import push_notebook
        from bokeh.models import HoverTool
        #if params['inline'] == 'yes':
        #    output_notebook()
        #else:
        if params['outputfile'] == '':
            filename = 'bp' + time.strftime("%H%M_%d_%m_%Y") + '.html'
        else:
            filename = params['outputfile']
        output_file(os.path.join(dirname, filename))
        source = ColumnDataSource(sp)
        p = figure(width=params['plot_width'], height=params['plot_height'],
                   title=figtitle, x_axis_label='Time(mins)', y_axis_label='Intensity')
        p.multi_line(xs='x', ys='y', line_width=1, line_color='color', line_alpha=0.6,
                     hover_line_color='color', hover_line_alpha=1.0, source=source)
        p.add_tools(HoverTool(show_arrow=False, line_policy='next',
                              tooltips=[('id', '@id')]))
        try:
            p.toolbar.active_inspect = [None]
        except Exception:
            pass
        #if params['inline'] == 'yes':
        #    show(p)
        #    push_notebook(handle=hnotebook)
        #else:
        if params['display'] == 'yes':
            show(p)
        else:
            save(p)
def analyse(self):
    """
    Performs analysis using ANOVA on the StatisticalModel instance supplied
    at the initialization step
    """
    printlog('\nAnalysing...')
    # Initialize output for stat_model. Parameters and additional_groups need to be
    # passed as arguments to prepare the stat_model instance to receive the results
    # of the analysis.
    self.stat_model.initialize_output(parameters=self.parameters,
                                      additional_groups=self.additional_groups)
    # Iterate over the available data in stat_model. Each iteration deals with one
    # of the selected rt peaks and returns data as a dictionary containing the
    # experimental values of quantity integrals for the corresponding peak and the
    # metadata values for the selected metadata categories, e.g.
    # data = {'quantity_integrals': [1.1, 2.2, 1.3, 4.8], 'dose': [1, 2, 1, 2],
    # 'time': [8, 12, 24, 48]} for model 'C(dose)*C(time)'.
    # During iteration, self.stat_model.current_index contains the index of the item
    # of the self.stat_model.rt_indeces array being processed,
    # self.stat_model.current_rt contains the current rt value in seconds, and
    # self.stat_model.model_clean contains a clean representation of the model,
    # e.g. for input model 'C(dose)*C(time)' you will have
    # 'C(dose)+C(time)+C(dose):C(time)'.
    for data in self.stat_model.data():
        printlog('\n%s of %s: %.3f min.' % (self.stat_model.current_index + 1,
                                            len(self.stat_model.rt_indeces),
                                            self.stat_model.current_rt / 60.0))
        # For ANOVA from statsmodels, add quantity_integrals to the model representation.
        model_description = 'quantity_integrals ~ ' + '+'.join(self.stat_model.model_clean)
        # Prepare dataframe
        dataframe = DataFrame(data=data)
        # Fit linear model
        lm = ols(model_description, data=dataframe).fit()
        printlog(lm.summary())
        printlog('\n')
        try:
            # Sometimes ANOVA can fail, thus try/except.
            anova = sm.stats.anova_lm(lm, typ=2, robust=self.robust)
            # Results are converted from a transposed dataframe to a dictionary.
            # This way the results contain a dictionary of dictionaries with groups
            # and group combinations from the model as the keys of the first-level
            # dictionary and the calculated parameters as the keys of the inner
            # dictionaries, e.g.
            # results = {
            #     'C(Virus):C(Time)': {'PR(>F)': 0.66408762493566431, 'sum_sq': 56504705293.388962, 'F': 0.60495953227434596},
            #     'C(Time)':          {'PR(>F)': 0.20201607440256636, 'sum_sq': 81742622757.011337, 'F': 1.7503313599529786},
            #     'C(Virus)':         {'PR(>F)': 0.80706629815712594, 'sum_sq': 10130541575.940208, 'F': 0.21692238413226958},
            #     'Residual':         {'PR(>F)': nan, 'sum_sq': 420311046036.94336, 'F': nan}
            # }
            results = anova.transpose().to_dict()
            # For the results, calculate the background color to be used in the HTML
            # report based on the values obtained.
            results_color = {}
            for key in results:                  # for each returned group
                result_set = results[key]        # get subgroup
                p_value = result_set['PR(>F)']   # get p-value
                if isnull(p_value):
                    # Not calculable - set color to grey.
                    color = 0x505050
                elif p_value <= self.p_value_threshold:
                    # p_value_threshold passed - the result color is green.
                    color = 0x90FF90
                else:
                    # p_value_threshold not passed - the result color is red.
                    color = 0xFF9090
                # Apply coloring to all results.
                result_set_color = {}            # coloring of this result_set
                results_color[key] = result_set_color
                for subkey in result_set:
                    if isnull(result_set[subkey]):
                        # Value not calculable - the color is grey.
                        result_set_color[subkey] = 0x505050
                    else:
                        # Otherwise the color reflects whether p_value_threshold is satisfied.
                        result_set_color[subkey] = color
                if not isnull(p_value) and p_value <= self.p_value_threshold:
                    # Also add p_value_pass as one of the returned result values:
                    # 1 if the p_value_threshold is satisfied, 0 otherwise.
                    result_set['p_value_pass'] = 1.0
                    result_set_color['p_value_pass'] = 0x90FF90  # green
                else:
                    result_set['p_value_pass'] = 0.0
                    result_set_color['p_value_pass'] = 0xFF9090  # red
            # Send results and their colors to stat_model for storage and reporting.
            # The stat_model instance will parse and display them according to the
            # parameters and additional groups provided in
            # self.stat_model.initialize_output(parameters, additional_groups).
            self.stat_model.store_results(results, results_color)
        except Exception as inst:
            printlog('ANOVA failed!')
            printlog(inst)
            traceback.print_exc()
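# Standalone hedged sketch (not part of the original module) of the statsmodels
# calls used in analyse(): an ordinary least squares fit followed by a type-2
# ANOVA on a toy two-factor dataset.
def _demo_anova():
    toy = DataFrame({
        'quantity_integrals': [1.1, 2.2, 1.3, 4.8, 2.0, 3.1],
        'dose': [1, 2, 1, 2, 1, 2],
        'time': [8, 8, 12, 12, 24, 24],
    })
    lm = ols('quantity_integrals ~ C(dose)+C(time)', data=toy).fit()
    return sm.stats.anova_lm(lm, typ=2)  # rows: C(dose), C(time), Residual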
def do_noisefilter(dbfilepath,
                   smoothmethod='sqfilter',
                   smoothparams={'window': 5, 'degree': 3},
                   baselinemethod='tophat',
                   baselineparams={'frame': 90},
                   params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'},
                   istrain=1):
    """
    Performs adjustment for high frequency noise and lower frequency baseline
    distortions due to a variety of instrumental and experimental reasons

    Args:
        dbfilepath: a user-specified path to the h5 database file
        smoothmethod: The type of noise filtering method. Default value:
            'sqfilter', i.e. the Savitzky-Golay filter.
        smoothparams: The dictionary of parameter arguments for the noise
            filtering method
        baselinemethod: The type of baseline correction method. Default value:
            'tophat', i.e. the top-hat morphological filter.
        baselineparams: The dictionary of parameter arguments for the baseline
            correction method
    """
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return
    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)
    if str(smoothparams['window']).lower() == 'auto':
        smoothparams['window'] = peak_width * 0.5
        printlog('Parameter "window" is set to %s' % smoothparams['window'])
    else:
        try:
            smoothparams['window'] = float(smoothparams['window'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "window" cannot be converted to float!'
                                    % smoothparams['window'])
    if str(baselineparams['frame']).lower() == 'auto':
        baselineparams['frame'] = peak_width * 15
        printlog('Parameter "frame" is set to %s' % baselineparams['frame'])
    else:
        try:
            baselineparams['frame'] = float(baselineparams['frame'])
        except Exception:
            raise LoggingValueError('Error! %s value for parameter "frame" cannot be converted to float!'
                                    % baselineparams['frame'])
    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if smoothmethod != 'none':
            SmoothObject = SmoothFilter(smoothmethod, smoothparams, rtrange)
        else:
            SmoothObject = []
        if baselinemethod != 'none':
            BaselineObject = BaselineFilter(baselinemethod, baselineparams, rtrange)
        else:
            BaselineObject = []
    elif istrain == 0:
        SmoothObject = SmoothFilter()
        SmoothObject.load_procobj(dbfilepath, params['h5readpath'])
        if SmoothObject.method == '':
            SmoothObject = []
        BaselineObject = BaselineFilter()
        BaselineObject.load_procobj(dbfilepath, params['h5readpath'])
        if BaselineObject.method == '':
            BaselineObject = []
    filternoise_h5(dbfilepath, dataset_names, SmoothObject, BaselineObject, params)
    if istrain == 1:
        # save into hdf5 database file
        if SmoothObject:
            SmoothObject.export(rtrange)
            SmoothObject.save_procobj(dbfilepath, params['h5writepath'])
        if BaselineObject:
            BaselineObject.export(rtrange)
            BaselineObject.save_procobj(dbfilepath, params['h5writepath'])
        SmoothObject.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])
    return
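# Hedged invocation sketch (not part of the original module), mirroring the
# documented defaults; 'auto' values are resolved from the stored peak_width
# estimate as shown above. The database file name is hypothetical.
#
# do_noisefilter('msdata.h5',
#                smoothmethod='sqfilter',
#                smoothparams={'window': 'auto', 'degree': 3},
#                baselinemethod='tophat',
#                baselineparams={'frame': 'auto'},
#                params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'},
#                istrain=1)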