def empty_datacube(self):
    """Return an ion_datacube carrying this dataset's geometry but no data.

    The returned cube reuses the coordinates, pixel indices and grid
    dimensions cached at load time, so callers can fill in xics directly.
    """
    cube = ion_datacube()
    # attach the precomputed pixel layout rather than recomputing it
    cube.coords = self.coords
    cube.pixel_indices = self.cube_pixel_indices
    cube.nRows = self.cube_n_row
    cube.nColumns = self.cube_n_col
    return cube
def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=None):
    """Load a centroided hdf5 dataset fully into memory, sorted by m/z.

    Parameters:
        filename: path to the .hdf5 file (must exist; opened read-only).
        min_mz, max_mz: keep only peaks with min_mz < mz < max_mz.
        min_int: keep only peaks with intensity > min_int.
        index_range: optional explicit list of spectrum indices; when
            empty/None, all keys under /spectral_data are used.

    Raises TypeError if a spectrum's mz and intensity arrays disagree
    in length.
    """
    # Split the path into directory / base name / extension.
    # (Bug fix: the original unpacked os.path.splitext directly, which
    # left the extension in self.filename and a (root, ext) tuple in
    # self.file_type.)
    self.file_dir, self.filename = os.path.split(filename)
    self.filename, self.file_type = os.path.splitext(self.filename)
    self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
    if not index_range:
        # keys of /spectral_data are spectrum indices stored as strings;
        # list() so the result survives repeated iteration on Python 3
        self.index_list = list(map(int, self.hdf['/spectral_data'].keys()))
    else:
        self.index_list = index_range
    self.max_index = max(self.index_list)
    self.coords = self.get_coords()
    # precompute pixel indices for use in get_ion_image
    cube = ion_datacube()
    cube.add_coords(self.coords)
    self.cube_pixel_indices = cube.pixel_indices
    self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
    # load data into memory
    self.mz_list = []
    self.count_list = []
    self.idx_list = []
    for ii in self.index_list:
        # load spectrum, keep values gt0 (shouldn't be here anyway)
        this_spectrum = self.get_spectrum(ii)
        mzs, counts = this_spectrum.get_spectrum(source='centroids')
        if len(mzs) != len(counts):
            raise TypeError(
                'length of mzs ({}) not equal to counts ({})'.format(
                    len(mzs), len(counts)))
        # Enforce data limits
        valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
        counts = counts[valid]
        mzs = mzs[valid]
        # append ever-growing lists (should probably be preallocated or
        # piped to disk and re-loaded)
        self.mz_list.append(mzs)
        self.count_list.append(counts)
        self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
    print('loaded spectra')
    self.mz_list = np.concatenate(self.mz_list)
    self.count_list = np.concatenate(self.count_list)
    self.idx_list = np.concatenate(self.idx_list)
    # sort by mz for fast image formation
    mz_order = np.argsort(self.mz_list)
    self.mz_list = self.mz_list[mz_order]
    self.count_list = self.count_list[mz_order]
    self.idx_list = self.idx_list[mz_order]
    self.mz_min = self.mz_list[0]
    self.mz_max = self.mz_list[-1]
    self.histogram_mz_axis = {}
    print('file loaded')
def get_summary_image(self, summary_func='tic'):
    """Return an ion_datacube holding one per-pixel summary statistic.

    summary_func: 'tic' or 'mic' — name of a precomputed per-spectrum
        attribute on self (total / maximum ion count).

    Raises KeyError for any other value.
    """
    if summary_func not in ['tic', 'mic']:
        # bug fix: original message had mismatched quotes ("'tic' mic'")
        raise KeyError("requested type not in 'tic', 'mic'")
    data_out = ion_datacube()
    # add precomputed pixel indices
    data_out.coords = self.coords
    data_out.pixel_indices = self.cube_pixel_indices
    data_out.nRows = self.cube_n_row
    data_out.nColumns = self.cube_n_col
    # look up the summary vector by name and restrict to loaded spectra
    data_out.add_xic(np.asarray(getattr(self, summary_func))[self.index_list], [0], [0])
    return data_out
def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=None, cache_spectra=True):
    """Open a centroided hdf5 dataset; optionally cache all spectra in memory.

    Parameters:
        filename: path to the .hdf5 file (must exist; opened read-only).
        min_mz, max_mz: keep only peaks with min_mz < mz < max_mz.
        min_int: keep only peaks with intensity > min_int.
        index_range: optional explicit list of spectrum indices; when
            empty/None, all keys under /spectral_data are used.
        cache_spectra: when True, load every spectrum, concatenate and
            sort by m/z, and build the two-stage search sublist.

    Raises TypeError if a spectrum's mz and intensity arrays disagree
    in length.
    """
    # Split the path into directory / base name / extension.
    # (Bug fix: the original unpacked os.path.splitext directly, which
    # left the extension in self.filename and a (root, ext) tuple in
    # self.file_type.)
    self.file_dir, self.filename = os.path.split(filename)
    self.filename, self.file_type = os.path.splitext(self.filename)
    self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
    if not index_range:
        # list() so the result survives repeated iteration on Python 3
        self.index_list = list(map(int, self.hdf['/spectral_data'].keys()))
    else:
        self.index_list = index_range
    self.max_index = max(self.index_list)
    self.coords = self.get_coords()
    # precompute pixel indices for use in get_ion_image
    cube = ion_datacube()
    cube.add_coords(self.coords)
    self.cube_pixel_indices = cube.pixel_indices
    self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
    self.histogram_mz_axis = {}
    if cache_spectra:
        # load data into memory
        self.mz_list = []
        self.count_list = []
        self.idx_list = []
        for ii in self.index_list:
            # load spectrum, keep values gt0 (shouldn't be here anyway)
            this_spectrum = self.get_spectrum(ii)
            mzs, counts = this_spectrum.get_spectrum(source='centroids')
            if len(mzs) != len(counts):
                raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
            # Enforce data limits
            valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
            counts = counts[valid]
            mzs = mzs[valid]
            # append ever-growing lists (should probably be preallocated
            # or piped to disk and re-loaded)
            self.mz_list.append(mzs)
            self.count_list.append(counts)
            self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
        print('loaded spectra')
        self.mz_list = np.concatenate(self.mz_list)
        self.count_list = np.concatenate(self.count_list)
        self.idx_list = np.concatenate(self.idx_list)
        # sort by mz for fast image formation
        mz_order = np.argsort(self.mz_list)
        self.mz_list = self.mz_list[mz_order]
        self.count_list = self.count_list[mz_order]
        self.idx_list = self.idx_list[mz_order]
        self.mz_min = self.mz_list[0]
        self.mz_max = self.mz_list[-1]
        # split binary searches into two stages for better locality
        self.window_size = 1024
        self.mz_sublist = self.mz_list[::self.window_size].copy()
    print('file loaded')
def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=None):
    """Load spectra from an .hdf5 or imzML file, mass-sorted via reorder.

    Parameters:
        filename: dataset path; .hdf5 is read with h5py, anything else
            is handed to ImzMLParser.
        min_mz, max_mz, min_int: peak filters (hdf5 path only — the
            imzML iterator below does not filter; presumably
            reorder.sortDatasetByMass copes — TODO confirm).
        index_range: optional explicit list of spectrum indices; when
            empty/None, all spectra are loaded.
    """
    if filename.endswith('.hdf5'):
        self.hdf = h5py.File(filename, 'r')
        # list() so len()/iteration work on Python 3 as well
        keys = index_range or list(map(int, self.hdf['/spectral_data'].keys()))
    else:
        self.imzml = ImzMLParser.ImzMLParser(filename)
        keys = index_range or range(len(self.imzml.coordinates))
    # NOTE(review): coords is sized by len(keys) but indexed by spectrum
    # index i — a sparse index_range would index out of bounds; confirm
    # callers only pass contiguous 0-based ranges.
    self.coords = np.zeros((len(keys), 3))

    def spectra_iter_hdf5(keys):
        # yield (index, mzs, counts) per spectrum, applying the filters
        # and recording coordinates as a side effect
        for i in keys:
            tmp_str = "/spectral_data/" + str(i)
            mzs = self.hdf[tmp_str + '/centroid_mzs/'][()]
            counts = self.hdf[tmp_str + '/centroid_intensities/'][()]
            self.coords[i, :] = self.hdf[tmp_str + '/coordinates']
            valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
            counts = counts[valid]
            mzs = mzs[valid]
            yield (i, mzs, counts)

    def spectra_iter_imzml(keys):
        # same contract as spectra_iter_hdf5 but reading via pyimzml
        for i in keys:
            coords = self.imzml.coordinates[i]
            mzs, counts = map(np.array, self.imzml.getspectrum(i))
            if len(coords) == 2:
                # pad 2-D coordinates with z = 0
                coords = (coords[0], coords[1], 0)
            self.coords[i, :] = coords
            yield (i, mzs, counts)

    sp_iter = spectra_iter_hdf5 if filename.endswith('.hdf5') else spectra_iter_imzml
    import reorder
    # sortDatasetByMass consumes the iterator and returns the dataset
    # concatenated and ordered by m/z
    data = reorder.sortDatasetByMass(sp_iter(keys))
    self.index_list, self.mz_list, self.count_list, self.idx_list = data
    self.max_index = max(self.index_list)
    cube = ion_datacube()
    cube.add_coords(self.coords)
    self.cube_pixel_indices = cube.pixel_indices
    self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
    self.mz_min = self.mz_list[0]
    self.mz_max = self.mz_list[-1]
    self.histogram_mz_axis = {}
    # split binary searches into two stages for better locality
    self.window_size = 1024
    self.mz_sublist = self.mz_list[::self.window_size].copy()
def get_datacube(reader, mzs, ppm):
    """Assemble an ion_datacube from per-m/z images produced by *reader*.

    Pixel indices are fixed by the first image: the positions whose
    raveled value is non-negative. Negative intensities are clipped to 0.
    """
    cube = ion_datacube()
    cube.xic = []
    cube.nRows = reader.height
    cube.nColumns = reader.width
    cube.pixel_indices = None
    for target_mz in mzs:
        frame = reader.get_mz_image(target_mz, ppm).ravel()
        if cube.pixel_indices is None:
            # valid pixels = non-negative entries of the first image
            cube.pixel_indices = np.where(frame >= 0)[0]
        frame = frame[cube.pixel_indices]
        frame[frame < 0] = 0.0
        cube.xic.append(frame)
    return cube
def get_ion_image(self, mz_list, tol_list, tol_type='mz', return_method='sum'):
    """Sum intensities within mz ± tol per spectrum and return a datacube.

    Parameters:
        mz_list, tol_list: parallel sequences of centre m/z and tolerance.
        tol_type: 'mz' (absolute) or 'ppm' (converted to m/z here).
        return_method: only 'sum' is implemented (see todo below).
    """
    # todo - ensure tol is a vector
    # define mz ranges once
    if tol_type == 'ppm':  # bug fix: was `tol_type == ppm` (undefined name)
        tol_list = [tol * mz / 1e6 for mz, tol in zip(mz_list, tol_list)]
    mz_list_upper = np.zeros(np.shape(mz_list))
    mz_list_lower = np.zeros(np.shape(mz_list))
    for mm in range(0, len(mz_list)):
        mz_list_upper[mm] = mz_list[mm] + tol_list[mm]
        mz_list_lower[mm] = mz_list[mm] - tol_list[mm]
    # sum intensities
    # todo - implement alternative return_method (e.g. max, median sum)
    xic_array = np.zeros((len(self.index_list), len(mz_list)))
    if self.consistent_mz == True:
        print("consistent")
        this_spectrum = self.get_spectrum(0)
        # precalculate which mzs fall in each window (shared mz axis)
        mz_index = np.zeros((len(this_spectrum.mzs), len(mz_list)), dtype=bool)
        for mm in range(0, len(mz_list)):
            mz_index[:, mm] = (this_spectrum.mzs < mz_list_upper[mm]) & (
                this_spectrum.mzs > mz_list_lower[mm])
        for ii in self.index_list:
            this_spectrum = self.get_spectrum(ii)
            for mm in range(0, len(mz_list)):
                xic_array[ii, mm] = sum(
                    this_spectrum.intensities[mz_index[:, mm]])
    else:
        print("inconsistent")
        for ii in self.index_list:
            this_spectrum = self.get_spectrum(ii)
            for mm in range(0, len(mz_list)):
                mz_index = (this_spectrum.mzs < mz_list_upper[mm]) & (
                    this_spectrum.mzs > mz_list_lower[mm])
                xic_array[ii, mm] = sum(this_spectrum.intensities[mz_index])
    data_out = ion_datacube()
    data_out.add_coords(self.coords)
    data_out.add_xic(xic_array, mz_list, tol_list)
    return data_out
def get_ion_image(self, mzs, tols, tol_type='ppm'):
    """Form one XIC image per m/z ± tol from the cached, m/z-sorted lists.

    Parameters:
        mzs, tols: parallel arrays of centre m/z and tolerance.
        tol_type: 'ppm' converts tols to m/z; anything else treats tols
            as absolute m/z. (Bug fix: tol_type was previously ignored
            and tols were always treated as ppm.)
    """
    if tol_type == 'ppm':
        tols = tols * mzs / 1e6  # to m/z
    data_out = ion_datacube()
    data_out.add_coords(self.coords)
    for mz, tol in zip(mzs, tols):
        mz_upper = mz + tol
        mz_lower = mz - tol
        print(mz_lower)
        print(mz_upper)
        idx_left = bisect.bisect_left(self.mz_list, mz_lower)
        idx_right = bisect.bisect_right(self.mz_list, mz_upper)
        # pad with index 0 and the maximum spectrum index so bincount
        # always yields a full-length per-spectrum vector
        count_vect = np.concatenate((np.asarray([0]), self.count_list[idx_left:idx_right], np.asarray([0])))
        idx_vect = np.concatenate((np.asarray([0]), self.idx_list[idx_left:idx_right], np.asarray([max(self.index_list)])))
        # bin vectors
        ion_vect = np.bincount(idx_vect, count_vect)
        data_out.add_xic(ion_vect, [mz], [tol])
    return data_out
def get_ion_image(self, mzs, tols, tol_type='ppm'):
    """Form one XIC image per m/z ± tol using vectorised binary search.

    Parameters:
        mzs, tols: parallel arrays of centre m/z and tolerance.
        tol_type: 'ppm' converts tols to m/z; otherwise tols are
            absolute m/z.
    """
    if tol_type == 'ppm':
        tols = tols * mzs / 1e6  # to m/z
    data_out = ion_datacube()
    # add precomputed pixel indices
    data_out.coords = self.coords
    data_out.pixel_indices = self.cube_pixel_indices
    data_out.nRows = self.cube_n_row
    data_out.nColumns = self.cube_n_col
    # full side names: abbreviated 'l'/'r' are not accepted by recent numpy
    idx_left = np.searchsorted(self.mz_list, mzs - tols, 'left')
    idx_right = np.searchsorted(self.mz_list, mzs + tols, 'right')
    for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
        # slice list for code clarity
        idx_vect = self.idx_list[il:ir]
        count_vect = self.count_list[il:ir]
        # bin vectors; minlength guarantees one entry per spectrum index
        ion_vect = np.bincount(idx_vect, count_vect, minlength=self.max_index + 1)
        data_out.add_xic(ion_vect, [mz], [tol])
    return data_out
def get_ion_image(self, mzs, tols, tol_type='ppm'):
    """Form one XIC image per m/z ± tol using vectorised binary search.

    Parameters:
        mzs, tols: parallel arrays of centre m/z and tolerance.
        tol_type: 'ppm' converts tols to m/z; otherwise tols are
            absolute m/z.
    """
    if tol_type == 'ppm':
        tols = tols * mzs / 1e6  # to m/z
    data_out = ion_datacube()
    # add precomputed pixel indices
    data_out.coords = self.coords
    data_out.pixel_indices = self.cube_pixel_indices
    data_out.nRows = self.cube_n_row
    data_out.nColumns = self.cube_n_col
    # full side names: abbreviated 'l'/'r' are not accepted by recent numpy
    idx_left = np.searchsorted(self.mz_list, mzs - tols, 'left')
    idx_right = np.searchsorted(self.mz_list, mzs + tols, 'right')
    for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
        # slice list for code clarity
        idx_vect = self.idx_list[il:ir]
        count_vect = self.count_list[il:ir]
        # bin vectors; minlength guarantees one entry per spectrum index
        ion_vect = np.bincount(idx_vect, count_vect, minlength=self.max_index + 1)
        data_out.add_xic(ion_vect, [mz], [tol])
    return data_out
def get_ion_image(self, mzs, tols, tol_type='ppm'):
    """Form one XIC image per m/z ± tol from the cached, m/z-sorted lists.

    Parameters:
        mzs, tols: parallel arrays of centre m/z and tolerance.
        tol_type: 'ppm' converts tols to m/z; anything else treats tols
            as absolute m/z. (Bug fix: tol_type was previously ignored
            and tols were always treated as ppm.)
    """
    if tol_type == 'ppm':
        tols = tols * mzs / 1e6  # to m/z
    data_out = ion_datacube()
    data_out.add_coords(self.coords)
    for mz, tol in zip(mzs, tols):
        mz_upper = mz + tol
        mz_lower = mz - tol
        print(mz_lower)
        print(mz_upper)
        idx_left = bisect.bisect_left(self.mz_list, mz_lower)
        idx_right = bisect.bisect_right(self.mz_list, mz_upper)
        # pad with index 0 and the maximum spectrum index so bincount
        # always yields a full-length per-spectrum vector
        count_vect = np.concatenate(
            (np.asarray([0]), self.count_list[idx_left:idx_right],
             np.asarray([0])))
        idx_vect = np.concatenate(
            (np.asarray([0]), self.idx_list[idx_left:idx_right],
             np.asarray([max(self.index_list)])))
        # bin vectors
        ion_vect = np.bincount(idx_vect, count_vect)
        data_out.add_xic(ion_vect, [mz], [tol])
    return data_out
def get_ion_image(self, mz_list, tol_list, tol_type='mz', return_method='sum'):
    """Sum intensities within mz ± tol per spectrum and return a datacube.

    Parameters:
        mz_list, tol_list: parallel sequences of centre m/z and tolerance.
        tol_type: 'mz' (absolute) or 'ppm' (converted to m/z here).
        return_method: only 'sum' is implemented (see todo below).
    """
    # todo - ensure tol is a vector
    # define mz ranges once
    if tol_type == 'ppm':  # bug fix: was `tol_type == ppm` (undefined name)
        tol_list = [tol * mz / 1e6 for mz, tol in zip(mz_list, tol_list)]
    mz_list_upper = np.zeros(np.shape(mz_list))
    mz_list_lower = np.zeros(np.shape(mz_list))
    for mm in range(0, len(mz_list)):
        mz_list_upper[mm] = mz_list[mm] + tol_list[mm]
        mz_list_lower[mm] = mz_list[mm] - tol_list[mm]
    # sum intensities
    # todo - implement alternative return_method (e.g. max, median sum)
    xic_array = np.zeros((len(self.index_list), len(mz_list)))
    if self.consistent_mz == True:
        print("consistent")
        this_spectrum = self.get_spectrum(0)
        # precalculate which mzs fall in each window (shared mz axis)
        mz_index = np.zeros((len(this_spectrum.mzs), len(mz_list)), dtype=bool)
        for mm in range(0, len(mz_list)):
            mz_index[:, mm] = (this_spectrum.mzs < mz_list_upper[mm]) & (this_spectrum.mzs > mz_list_lower[mm])
        for ii in self.index_list:
            this_spectrum = self.get_spectrum(ii)
            for mm in range(0, len(mz_list)):
                xic_array[ii, mm] = sum(this_spectrum.intensities[mz_index[:, mm]])
    else:
        print("inconsistent")
        for ii in self.index_list:
            this_spectrum = self.get_spectrum(ii)
            for mm in range(0, len(mz_list)):
                mz_index = (this_spectrum.mzs < mz_list_upper[mm]) & (this_spectrum.mzs > mz_list_lower[mm])
                xic_array[ii, mm] = sum(this_spectrum.intensities[mz_index])
    data_out = ion_datacube()
    data_out.add_coords(self.coords)
    data_out.add_xic(xic_array, mz_list, tol_list)
    return data_out
def _calculate_dimensions(self):
    """Derive the image grid (rows, columns, pixel indices) from self.coords."""
    geometry = ion_datacube()
    geometry.add_coords(self.coords)
    # cache the grid shape as plain ints plus the flattened pixel lookup
    self.nrows = int(geometry.nRows)
    self.ncols = int(geometry.nColumns)
    self.pixel_indices = geometry.pixel_indices
def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=None, cache_spectra=True, do_summary=True, norm=None):
    """Parse an .hdf5 or .imzml dataset; optionally cache spectra / summaries.

    Parameters:
        filename: dataset path; dispatches on the (lower-cased) extension.
        min_mz, max_mz: keep only peaks with min_mz < mz < max_mz.
        min_int: keep only peaks with intensity > min_int.
        index_range: optional explicit list of spectrum indices (hdf5
            only); empty/None means all spectra.
        cache_spectra: concatenate all spectra, sorted by m/z, and build
            the two-stage search sublist.
        do_summary: additionally record per-spectrum tic (sum) and mic
            (max) vectors.
        norm: normalisation spec stored on self; defaults to [].

    Raises:
        TypeError: unrecognised file extension, or a spectrum whose mz
            and intensity arrays disagree in length.
    """
    # parse file to get required parameters
    # can use thin hdf5 wrapper for getting data from file
    self.file_dir, self.filename = os.path.split(filename)
    self.filename, self.file_type = os.path.splitext(self.filename)
    self.file_type = self.file_type.lower()
    # avoid a shared mutable default argument
    self.norm = [] if norm is None else norm
    if self.file_type == '.hdf5':
        import h5py
        self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
        if not index_range:
            # list() so the result survives repeated iteration on Python 3
            self.index_list = list(map(int, self.hdf['/spectral_data'].keys()))
        else:
            self.index_list = index_range
    elif self.file_type == '.imzml':
        from pyimzml.ImzMLParser import ImzMLParser
        self.imzml = ImzMLParser(filename)
        self.index_list = range(0, len(self.imzml.coordinates))
    else:
        # bug fix: message read "recogised"
        raise TypeError('File type not recognised: {}'.format(self.file_type))
    self.max_index = max(self.index_list)
    self.coords = self.get_coords()
    step_size = self.get_step_size()
    # precompute pixel indices for use in get_ion_image
    cube = ion_datacube(step_size=step_size)
    cube.add_coords(self.coords)
    self.cube_pixel_indices = cube.pixel_indices
    self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
    self.histogram_mz_axis = {}
    # sentinels: narrowed as spectra are read below
    self.mz_min = 9999999999999.
    self.mz_max = 0.
    if cache_spectra or do_summary:
        # load data into memory
        self.mz_list = []
        self.count_list = []
        self.idx_list = []
        if do_summary:
            self.mic = np.zeros((len(self.index_list), 1))
            self.tic = np.zeros((len(self.index_list), 1))
        for ii in self.index_list:
            # load spectrum, keep values gt0 (shouldn't be here anyway)
            this_spectrum = self.get_spectrum(ii)
            mzs, counts = this_spectrum.get_spectrum(source='centroids')
            if len(mzs) != len(counts):
                raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
            # Enforce data limits
            valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
            counts = counts[valid]
            mzs = mzs[valid]
            if len(mzs) == 0:
                # bug fix: an empty post-filter spectrum made mzs[0] /
                # max(counts) below raise; skip it instead
                continue
            # record min/max (assumes mzs are in ascending order — the
            # sort below doesn't require it, but these two lines do)
            if mzs[0] < self.mz_min:
                self.mz_min = mzs[0]
            if mzs[-1] > self.mz_max:
                self.mz_max = mzs[-1]
            # append ever-growing lists (should probably be preallocated
            # or piped to disk and re-loaded)
            if cache_spectra:
                self.mz_list.append(mzs)
                self.count_list.append(counts)
                self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
            # record summary values
            if do_summary:
                self.tic[ii] = sum(counts)
                self.mic[ii] = max(counts)
        print('loaded spectra')
        if cache_spectra:
            self.mz_list = np.concatenate(self.mz_list)
            self.count_list = np.concatenate(self.count_list)
            self.idx_list = np.concatenate(self.idx_list)
            # sort by mz for fast image formation
            mz_order = np.argsort(self.mz_list)
            self.mz_list = self.mz_list[mz_order]
            self.count_list = self.count_list[mz_order]
            self.idx_list = self.idx_list[mz_order]
            # split binary searches into two stages for better locality
            self.window_size = 1024
            self.mz_sublist = self.mz_list[::self.window_size].copy()
    print('file loaded')
def get_ion_image(self, mzs, tols, tol_type='ppm'):
    """Form one XIC image per m/z ± tol via a two-stage binary search.

    Parameters:
        mzs: scalar or array-like of centre m/z values (a scalar is
            wrapped into a 1-element array).
        tols: tolerance(s), parallel to mzs.
        tol_type: 'ppm' converts tols to m/z; otherwise tols are
            absolute m/z.

    Returns an ion_datacube with one xic per requested m/z.
    """
    # NOTE(review): removed two unused inner helpers (search_sort /
    # search_bisect) that referenced an undefined name `blank_dataout`
    # and were never called.
    data_out = ion_datacube()
    # add precomputed pixel indices
    data_out.coords = self.coords
    data_out.pixel_indices = self.cube_pixel_indices
    data_out.nRows = self.cube_n_row
    data_out.nColumns = self.cube_n_col
    if type(mzs) not in (np.ndarray, list):
        mzs = np.asarray([mzs, ])
    if tol_type == 'ppm':
        tols = tols * mzs / 1e6  # to m/z
    # Fast search for insertion point of mz in self.mz_list.
    # First stage: locate candidate windows in the strided sublist
    # (full side names: 'l'/'r' abbreviations are rejected by recent numpy)
    idx_left = np.searchsorted(self.mz_sublist, mzs - tols, 'left')
    idx_right = np.searchsorted(self.mz_sublist, mzs + tols, 'right')
    for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
        l = max(il - 1, 0) * self.window_size
        r = ir * self.window_size
        # Second stage: binary search within the windows
        il = l + np.searchsorted(self.mz_list[l:r], mz - tol, 'left')
        ir = l + np.searchsorted(self.mz_list[l:r], mz + tol, 'right')
        # slice lists for code clarity (the original also sliced mz_list
        # here, but never used it)
        idx_vect = self.idx_list[il:ir]
        count_vect = self.count_list[il:ir]
        # bin into a per-spectrum intensity vector; minlength guarantees
        # one entry per spectrum index (zeros when the window is empty)
        ion_vect = np.bincount(idx_vect, weights=count_vect, minlength=self.max_index + 1)
        data_out.add_xic(ion_vect, [mz], [tol])
    return data_out