def load_imzml_data_set(file):
    """Load every spectrum of an imzML file into pandas data frames.

    The file is resolved relative to the module-level ``data_path_imzml``
    directory. Nothing is written to disk and there is no ``flag``
    parameter: the function always returns the dictionary described below.

    :param file: name of the imzML file inside ``data_path_imzml``.
    :return: dict with the keys

        * ``"mass"`` -- DataFrame with one column of m/z values per spectrum,
        * ``"intensity"`` -- DataFrame with one column of intensities per
          spectrum,
        * ``"x"`` / ``"y"`` -- x and y of the *last* listed coordinate,
        * ``"f_name"`` -- ``file`` up to (not including) its first ``'.'``.
    """
    imzml_data_path = os.path.join(data_path_imzml, file)
    p = ImzMLParser(imzml_data_path)

    mass_data = {}
    intensity_data = {}

    # NOTE(review): presumably the last pixel carries the image extent;
    # that only holds if the pixels are stored in scan order -- confirm.
    x_cord, y_cord = p.coordinates[-1][0], p.coordinates[-1][1]

    # Only the spectrum index is needed, so the coordinate is not unpacked
    # (this also tolerates parsers that report 2-tuples).
    for idx, _ in enumerate(p.coordinates):
        # mzs are masses over charge of 1 ion
        # intensities correspond to the abundance of the particular ion
        mzs, intensities = p.getspectrum(idx)
        mass_data[idx] = mzs
        intensity_data[idx] = intensities

    # CONVERT DICTS TO DATA FRAMES
    df_mass_data = pd.DataFrame(mass_data)
    df_intensity_data = pd.DataFrame(intensity_data)

    f_name = file.split('.')[0]
    return {"mass": df_mass_data,
            "intensity": df_intensity_data,
            "x": x_cord,
            "y": y_cord,
            "f_name": f_name}
def imzml_to_sbd(filepath_imzml, filepath_sbd):
    """Convert an .imzml/.ibd pair into a single .sbd file.

    The output starts with a 10-byte header (``'<BQB'``), followed by one
    20-byte index record per spectrum (``'<QLfHH'``: data offset, number of
    points, summed intensity, x, y) and finally the spectra themselves,
    written by ``write_spectrum``.

    Returns:
        bool: always True when no exception was raised.
    """
    with open(filepath_sbd, 'wb') as sbd:
        parser = ImzMLParser(filepath_imzml)
        spectrum_count = len(parser.coordinates)

        # Pass 1: build the index. Spectrum data starts right after the
        # header and the index table; 12 bytes are reserved per data point.
        index_rows = []
        data_offset = 20 * spectrum_count + 10
        for idx, (x, y, z) in enumerate(parser.coordinates):
            mzs, intensities = parser.getspectrum(idx)
            point_count = len(mzs)
            index_rows.append(
                (data_offset, point_count, np.sum(intensities), x, y))
            data_offset = data_offset + point_count * 12

        # Header and index table.
        sbd.write(struct.pack('<BQB', 0, spectrum_count, 8))
        for row in index_rows:
            sbd.write(struct.pack('<QLfHH', *row))

        # Pass 2: re-read every spectrum and append its data.
        for idx in range(spectrum_count):
            mzs, intensities = parser.getspectrum(idx)
            write_spectrum(sbd, (mzs, intensities))
    return True
def __read_all(self, filename):
    """
    Internal helper function used to read all data. The function directly
    modifies the self.data entry, which is (re)allocated as a zero-filled
    array of shape ``self.shape`` and dtype ``self.data_type``.

    In 'processed' mode every spectrum is linearly re-interpolated onto the
    common axis ``self.mz`` (values outside the measured range become 0).

    :param filename: path of the imzML file handed to ``ImzMLParser``.
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

    # Only the mode flag is needed; the former bin-edge computation was
    # never used by the interpolation below.
    reinterpolate = self.imzml_type == self.available_imzml_types['processed']

    for ind, coordinate in enumerate(reader.coordinates):
        # Coordinates may be (x, y) or (x, y, z) depending on the parser
        # version; only x and y are used here.
        xidx, yidx = coordinate[:2]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min

        # Read the spectrum
        mz, intens = reader.getspectrum(ind)

        # Reinterpolate intensities if we are in processed mode
        if reinterpolate:
            f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
            intens = f(self.mz)

        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
def write_corrected_msi(msi, output_file, tolerance, database_exactmass, step, dalim):
    """Write a copy of ``msi`` whose m/z axes were corrected per pixel.

    A pixel is written to ``output_file`` only if it passes every stage:
    more than 30 selected peaks, more than 10 database hits, a hit
    selection summing to more than 10, and a truthy error model from
    ``create_lm``. Pixels failing any stage are left out of the output.
    """
    with ImzMLWriter(output_file) as writer:
        parser = ImzMLParser(msi, parse_lib='ElementTree')
        for idx, (x, y, z) in enumerate(parser.coordinates):
            ms_mzs, ms_intensities = parser.getspectrum(idx)

            peaks_ind = peak_selection(ms_intensities)
            peaks_mz = ms_mzs[peaks_ind]
            if not len(peaks_mz) > 30:
                continue

            hit_exp, hit_errors = hits_generation(
                peaks_mz, database_exactmass, tolerance)
            if not len(hit_errors) > 10:
                continue

            roi = hits_selection(hit_errors, step, tolerance, da_limit=dalim)
            if not np.sum(roi) > 10:
                continue

            mz_error_model = create_lm(
                hit_exp, hit_errors,
                tolerance=tolerance, da_limit=dalim, step=step)
            if not mz_error_model:
                continue

            corrected_mzs = correct_mz_lm(ms_mzs, mz_error_model)
            writer.addSpectrum(corrected_mzs, ms_intensities, (x, y, z))
def spectrum_iter(self):
    """
    Generator function that yields a position and associated spectrum for a
    selected datacube type.

    :yield: (xidx, yidx) a tuple of ints representing x and y position in
        the image, shifted by ``self.x_pos_min`` / ``self.y_pos_min``
    :yield: yi, a numpy 1D-array of floats containing spectral intensities
        at the given position and for the selected datacube type
    """
    reader = ImzMLParser(self.basename)
    for idx, coordinate in enumerate(reader.coordinates):
        # Accept both (x, y) and (x, y, z) coordinates; z is not used.
        xidx, yidx = coordinate[:2]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min

        mz, intens = reader.getspectrum(idx)

        # Resample onto the common m/z axis if we are in processed mode
        # (linear interpolation, 0 outside the measured range).
        if self.imzml_type == self.available_imzml_types['processed']:
            f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
            intens = f(self.mz)

        yield (xidx, yidx), np.asarray(intens)
def run(self):
    """Collect per-spectrum intensity statistics and dump them as JSON.

    For every spectrum of ``self.imzml_filename`` the number of peaks and
    the min, max, peak-to-peak and 5/25/50/75/95th percentiles of the
    intensities are recorded. The result is written to
    ``self.output().path``.
    """
    from pyimzml.ImzMLParser import ImzMLParser
    import json

    pcts = [5, 25, 50, 75, 95]
    n_peaks = []
    s_min = []
    s_max = []
    s_ptp = []
    s_pcts = []

    p = ImzMLParser(self.imzml_filename)
    for i, _ in enumerate(p.coordinates):
        mzs, ints = p.getspectrum(i)
        n_peaks.append(len(mzs))
        # .item()/.tolist() turn NumPy scalars into plain Python numbers;
        # json cannot serialize e.g. numpy.float32 or numpy.int32.
        s_min.append(np.min(ints).item())
        s_max.append(np.max(ints).item())
        s_ptp.append(np.ptp(ints).item())
        s_pcts.append(np.percentile(ints, pcts).tolist())

    stats = {
        'n_peaks': n_peaks,
        's_min': s_min,
        's_max': s_max,
        's_ptp': s_ptp,
        's_pcts': s_pcts
    }
    with open(self.output().path, 'w+') as f:
        json.dump(stats, f)
    # Call form works on both Python 2 and Python 3.
    print('wrote spec stats')
def get_ds_spots(ds_id):
    """Merge the spectra of each named grid-mask region of a dataset.

    Reads ``raw_datasets/<ds_id>.imzML``, ``spotting/grids/<ds_id>.npy`` and
    ``spotting/grids/<ds_id>_mask_names.json``.

    :param ds_id: dataset identifier used to build the three paths.
    :return: dict mapping every mask name except ``'background'`` to a
        tuple ``(mzs, ints, n_spectra)`` where ``mzs, ints`` come from
        ``merge_spectra`` applied to the region's normalized spectra.
    """
    parser = ImzMLParser(f'raw_datasets/{ds_id}.imzML')
    grid_mask = np.load(f'spotting/grids/{ds_id}.npy')
    # Close the JSON file deterministically instead of leaking the handle.
    with open(f'spotting/grids/{ds_id}_mask_names.json') as mask_names_file:
        mask_names = json.load(mask_names_file)

    # Make a mapping of coordinate -> spectrum index (-1 = no spectrum)
    coords = np.array(parser.coordinates)[:, :2]
    base_coord = np.min(coords, axis=0)
    coord_to_idx = np.ones(np.max(coords, axis=0) - base_coord + 1, dtype='i') * -1
    for i, (x, y) in enumerate(coords):
        coord_to_idx[x - base_coord[0], y - base_coord[1]] = i

    # Collect spectra for each mask item
    spots = {}
    for i, mask_name in enumerate(mask_names):
        if mask_name == 'background':
            continue
        # The mask is indexed [y, x]; the lookup table is indexed [x, y].
        spectra_ys, spectra_xs = np.nonzero(grid_mask == i)
        # NOTE(review): mask pixels without a spectrum map to -1, which is
        # passed to getspectrum unchanged -- confirm masks never cover
        # empty pixels.
        spectra = [
            parser.getspectrum(idx)
            for idx in coord_to_idx[spectra_xs, spectra_ys]
        ]
        # Scale every spectrum so that its intensities sum to 1e6.
        norm_spectra = [(mzs, ints * 1e6 / np.sum(ints)) for mzs, ints in spectra]
        mzs, ints = merge_spectra(norm_spectra)
        spots[mask_name] = mzs, ints, len(norm_spectra)
    return spots
def __read_all(self, filename):
    """
    Internal helper function used to read all data. The function directly
    modifies the self.data entry, which is (re)allocated as a zero-filled
    array of shape ``self.shape`` and dtype ``self.data_type``.

    In 'processed' mode every spectrum is re-binned onto ``self.mz`` with an
    intensity-weighted histogram.

    :param filename: path of the imzML file handed to ``ImzMLParser``.
    """
    self.data = np.zeros(shape=self.shape, dtype=self.data_type)
    log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
    reader = ImzMLParser(filename)
    log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

    # Compute the bin edges for re-binning if needed: the m/z axis plus one
    # extra edge one mean step beyond the last value.
    if self.imzml_type == self.available_imzml_types['processed']:
        shift = np.diff(self.mz).mean()
        bin_edges = np.append(self.mz, self.mz[-1] + shift)
    else:
        bin_edges = None

    for ind, coordinate in enumerate(reader.coordinates):
        # Coordinates may be (x, y) or (x, y, z) depending on the parser
        # version; only x and y are used here.
        xidx, yidx = coordinate[:2]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min

        # Read the spectrum
        mz, intens = reader.getspectrum(ind)

        # Re-bin intensities if we are in processed mode
        if bin_edges is not None:
            intens, _ = np.histogram(mz, bins=bin_edges, weights=intens)

        # Save the intensity values in our data cube
        self.data[xidx, yidx, :] = intens
def test_parser_get_spectrum(data_path, parse_lib):
    """Each pixel yields non-empty m/z and intensity arrays of equal size."""
    imzml = ImzMLParser(data_path, parse_lib=parse_lib)
    for pixel_index in range(imzml.n_pixels):
        spectrum = imzml.get_spectrum(pixel_index)
        mz_values, intensity_values = spectrum
        assert len(mz_values) == len(intensity_values)
        assert len(mz_values) > 0
        assert len(intensity_values) > 0
def __init__(self, imzml_path: pathlib.Path):
    """Parse ``imzml_path`` and keep only its portable spectrum reader.

    The parser itself is not retained. Any failure while parsing is
    re-raised as ``ImzMLError`` carrying the formatted traceback.
    """
    try:
        # The parser is a temporary: only the portable reader is stored.
        self.spectrum_reader = ImzMLParser(
            imzml_path, parse_lib="ElementTree"
        ).portable_spectrum_reader()
    except Exception as e:
        raise ImzMLError(format_exc()) from e

    self._stream = None
def __init__(self, filename, startX=1, startY=1, width=None, height=None, cropToData=False):
    """Open an imzML file and build an index image for a rectangular region.

    :param filename: path of the imzML file.
    :param startX: first column of the region (ignored when cropToData).
    :param startY: first row of the region (ignored when cropToData).
    :param width: region width; defaults to everything up to the largest x.
    :param height: region height; defaults to everything up to the largest y.
    :param cropToData: start the region at the smallest x/y that holds data.

    Sets ``self.indexImage`` (height x width, spectrum index per pixel, -1
    where there is no spectrum) and ``self.coordinates``, a list of
    ``(spectrum_index, x, y)`` for the spectra inside the region.
    """
    self.imzML = ImzMLParser(filename)

    # Find the min and max row and column where data is present
    maxWidth = 0
    maxHeight = 0
    minWidth = -1   # -1 means "not seen yet"
    minHeight = -1

    for (x, y, z) in self.imzML.coordinates:
        if x > maxWidth:
            maxWidth = x
        if y > maxHeight:
            maxHeight = y
        if minWidth == -1 or minWidth > x:
            minWidth = x
        if minHeight == -1 or minHeight > y:
            minHeight = y

    if cropToData:
        startX = minWidth
        startY = minHeight

    if width is None:
        width = maxWidth - startX + 1
    if height is None:
        height = maxHeight - startY + 1

    self.startX = startX
    self.startY = startY
    self.width = width
    self.height = height
    self.coordinates = []
    self.cropToData = cropToData

    # ``np.int`` was removed from NumPy (deprecated in 1.20); the builtin
    # ``int`` selects the same default integer dtype.
    self.indexImage = np.ones((height, width), dtype=int) * -1

    for index, (x, y, z) in enumerate(self.imzML.coordinates):
        inside = (startX <= x < startX + width
                  and startY <= y < startY + height)
        if not inside:
            continue
        if cropToData:
            # Cropped coordinates are reported 1-based.
            self.coordinates.append(
                (index, x - minWidth + 1, y - minHeight + 1))
            self.indexImage[y - minHeight, x - minWidth] = index
        else:
            self.coordinates.append((index, x, y))
            self.indexImage[y - startY, x - startX] = index
class IMSDataset:
    """imzML dataset with helpers to export it as a table, array or zarr.

    ``micro_res`` and ``IMS_res`` are only used through their ratio
    ``IMS_px_in_micro = IMS_res / micro_res``, which scales pixel
    coordinates in ``to_columnar``.
    """

    def __init__(self, fpath, micro_res=0.5, IMS_res=10):
        self.parser = ImzMLParser(fpath)
        self.micro_res = micro_res
        self.IMS_res = IMS_res
        self.IMS_px_in_micro = IMS_res / micro_res

    def __get_min_max_coords(self):
        """Return ``(x_min, y_min, x_max, y_max)`` over all coordinates."""
        coords = np.array(self.parser.coordinates)
        x_min, y_min, _ = np.min(coords, axis=0)
        x_max, y_max, _ = np.max(coords, axis=0)
        return x_min, y_min, x_max, y_max

    def to_columnar(self, mz_precision=4, dtype="uint32"):
        """Return one DataFrame row per pixel: coordinates plus intensities.

        The intensity columns are named after the m/z values of spectrum 0
        rounded to ``mz_precision`` decimals, so every spectrum must have
        the same length as spectrum 0.

        :param mz_precision: decimals kept in the m/z column names.
        :param dtype: dtype applied to both the coordinate and the
            intensity frames.
        """
        mzs, _ = self.parser.getspectrum(0)
        # Use this instance's parser (previously an undefined global
        # ``dataset`` was referenced here).
        coords = np.array(self.parser.coordinates)
        x, y, _ = coords.T

        coords_df = pd.DataFrame(
            {
                "x": x,
                "y": y,
                "micro_x_topleft": x * self.IMS_px_in_micro - self.IMS_px_in_micro,
                "micro_y_topleft": y * self.IMS_px_in_micro - self.IMS_px_in_micro,
                "micro_px_width": np.repeat(self.IMS_px_in_micro, len(coords)),
            },
            dtype=dtype,
        )

        intensities = np.zeros((len(coords_df), len(mzs)))
        for i, _ in enumerate(coords):
            _, coord_intensities = self.parser.getspectrum(i)
            intensities[i, :] = coord_intensities

        intensities = pd.DataFrame(
            intensities,
            columns=np.round(mzs, mz_precision).astype(str),
            dtype=dtype,
        )
        return coords_df.join(intensities)

    def to_array(self):
        """Return the intensities as an ``(x, y, mz)`` array.

        :raises ValueError: if the spectra do not all have the same length.
        """
        x_min, y_min, x_max, y_max = self.__get_min_max_coords()
        mz_lengths = self.parser.mzLengths
        if not (mz_lengths.count(mz_lengths[0]) == len(mz_lengths)):
            raise ValueError("The number of m/z is not the same at each coordinate.")

        arr = np.zeros((x_max - x_min + 1, y_max - y_min + 1, mz_lengths[0]))
        for idx, (x, y, _) in enumerate(self.parser.coordinates):
            _, intensities = self.parser.getspectrum(idx)
            arr[x - x_min, y - y_min, :] = intensities
        return arr

    def write_zarr(self, path, dtype="i4"):
        """Write ``to_array()`` to an uncompressed zarr array at ``path``."""
        arr = self.to_array()
        z_arr = zarr.open(path, mode="w", shape=arr.shape, compressor=None, dtype=dtype)
        z_arr[:, :, :] = arr
def get_spec(x, y1, y2, imzML_file):
    """Collect the spectra of column ``x`` for rows ``y1 <= y < y2``.

    :param x: x coordinate of the pixels to read.
    :param y1: first y coordinate (inclusive).
    :param y2: last y coordinate (exclusive).
    :param imzML_file: path of the imzML file.
    :return: dict mapping spectrum index to an array of the values of
        ``tupel2map(spectrum)``. Pixels ``(x, y, 1)`` that are not in the
        file are reported on stdout and skipped.
    """
    parser = ImzMLParser(imzML_file)
    part_map = dict()
    for y in range(y1, y2):
        # Only a missing coordinate is an expected condition; any other
        # error (I/O, conversion) must not be disguised as "not in list".
        try:
            idx = parser.coordinates.index((x, y, 1))
        except ValueError:
            print(f"({x}, {y}, 1) is not in list.")
            continue
        spec_map = tupel2map(parser.getspectrum(idx))
        part_map[idx] = np.array(list(spec_map.values()))
    return part_map
def __init__(self, path: Path):
    """Locate the imzML file under ``path`` and parse it.

    Parsing failures are re-raised as ``ImzMLError`` carrying the formatted
    traceback; the parser is then handed to the parent initializer.
    """
    self.filename = find_file_by_ext(path, 'imzml')

    try:
        parser = ImzMLParser(
            self.filename,
            parse_lib='ElementTree',
            include_spectra_metadata=METADATA_FIELDS,
        )
        self._imzml_parser = parser
    except Exception as e:
        raise ImzMLError(format_exc()) from e

    super().__init__(self._imzml_parser)
def __init__(self, filename):
    """Initialize the filtering framework from an imzml file.

    All spectra are read eagerly into ``self.mzlist`` and
    ``self.intensity_list``. ``self.mzs`` / ``self.intensities`` end up
    holding the last spectrum that was read.
    """
    self.spectrum = ImzMLParser(filename)
    self.mzlist = []
    self.intensity_list = []
    self.filename = []

    # Both lists are still empty here, so these are zero-length arrays.
    self.filter_spec_mass = np.zeros(np.shape(self.mzlist))
    self.filter_spec_intens = np.zeros(np.shape(self.intensity_list))

    for position, (_x, _y, _z) in enumerate(self.spectrum.coordinates):
        current = self.spectrum.getspectrum(position)
        self.mzs, self.intensities = current
        self.mzlist.append(self.mzs)
        self.intensity_list.append(self.intensities)
def __init__(self, storage: Storage, imzml_cobject: CloudObject, ibd_cobject: CloudObject):
    """Parse the imzML cloud object as a stream; keep the ibd object for later.

    No ibd file is passed to the parser (``ibd_file=None``); only the
    portable spectrum reader and the ibd cloud object are stored.
    """
    imzml_stream = storage.get_cloudobject(imzml_cobject, stream=True)
    parser = ImzMLParser(
        imzml_stream,
        ibd_file=None,
        parse_lib='ElementTree',
        include_spectra_metadata=METADATA_FIELDS,
    )

    self._ibd_cobject = ibd_cobject
    self.imzml_reader = parser.portable_spectrum_reader()

    super().__init__(parser)
def test_parser_iter(data_path, parse_lib):
    """Iterating the parser yields the same spectra as ``get_spectrum``."""
    imzml = ImzMLParser(data_path, parse_lib=parse_lib)

    seen = 0
    for pixel_index, (iter_mz, iter_int) in enumerate(imzml):
        direct_mz, direct_int = imzml.get_spectrum(pixel_index)

        assert len(iter_mz) == len(iter_int)
        assert len(iter_mz) == len(direct_mz)
        assert len(iter_int) == len(direct_int)
        assert_equal(direct_mz, iter_mz)
        assert_equal(direct_int, iter_int)
        seen += 1

    assert seen == imzml.n_pixels
def import_imzml_dataset(filepath):
    """Read an .imzml file into a list of ``spectrum`` objects.

    Returns:
        list: one ``spectrum(mzs, intensities, x, y, z)`` per pixel, in
        file order.
    """
    parser = ImzMLParser(filepath)

    def _load(position, coordinate):
        # Build the spectrum object for one pixel.
        x, y, z = coordinate
        mzs, intensities = parser.getspectrum(position)
        return spectrum(mzs, intensities, x, y, z)

    return [
        _load(position, coordinate)
        for position, coordinate in enumerate(parser.coordinates)
    ]
def save_data_to_csv(filename):
    """Dump all spectra of an imzML file to two CSV files.

    ``filename`` is resolved relative to the module-level ``data_path``.
    ``mass_data.csv`` receives one column of m/z values per spectrum and
    ``intensities.csv`` one column of intensities per spectrum; both are
    written to the current working directory.
    """
    parser = ImzMLParser(os.path.join(data_path, filename))

    masses_by_pixel = {}
    intensities_by_pixel = {}
    for pixel, (_x, _y, _z) in enumerate(parser.coordinates):
        # m/z values and the abundance measured for each of them
        pixel_mzs, pixel_intensities = parser.getspectrum(pixel)
        masses_by_pixel[pixel] = pixel_mzs
        intensities_by_pixel[pixel] = pixel_intensities

    mass_frame = pd.DataFrame(masses_by_pixel)
    intensity_frame = pd.DataFrame(intensities_by_pixel)
    mass_frame.to_csv('mass_data.csv')
    intensity_frame.to_csv('intensities.csv')
class ImzmlDataset(BaseDataset):
    """Dataset backed by an imzML file read through pyimzml."""

    def __init__(self, filename):
        # Imported lazily so pyimzml is only needed for imzML datasets.
        from pyimzml.ImzMLParser import ImzMLParser

        super(ImzmlDataset, self).__init__(filename)
        parser = ImzMLParser(filename)
        self.imzml = parser
        self.coordinates = np.asarray(parser.coordinates)
        self.step_size = [1, 1, 1]  # fixme get pixel size from header data

    def get_spectrum(self, ix):
        """Return ``[mzs, counts]`` of spectrum ``ix`` as numpy arrays."""
        mzs, counts = self.imzml.getspectrum(ix)
        # todo return MassSpectrum
        return [np.asarray(values) for values in (mzs, counts)]

    def get_image(self, mz, tol):
        """Return the parser's ion image for ``mz`` with tolerance ``tol``."""
        return self.imzml.getionimage(mz, tol)
def on_pushButton_clicked(self):
    """
    Slot: let the user pick an imzML file and start the averaging worker.

    Does nothing when the dialog is cancelled. Otherwise a progress form
    with a 'Cancel' button is shown, the file is parsed and an
    ``Average_mz_cal`` worker is started. Any exception is reported through
    ``self.error``.
    """
    try:
        file_name, _ = QFileDialog.getOpenFileName(
            self, u'Choose Imzml file', os.getcwd(), 'Imzml files (*.imzml)')
        if not file_name:
            return

        self.lineEdit_1.setText(file_name)

        # Progress window whose button cancels the worker.
        self.progressBar = My_Progress_Form()
        self.progressBar.progressBar.setValue(0)
        cancel_button = self.progressBar.pushButton
        cancel_button.setVisible(True)
        cancel_button.setText('Cancel')
        cancel_button.clicked.connect(self.thread_terminate)
        self.progressBar.show()

        # Parse the chosen file and hand it to the worker.
        self.p = ImzMLParser(self.lineEdit_1.text())
        self.mbt = Average_mz_cal(self.p)
        self.mbt.trigger.connect(self.progress_update)
        self.mbt.trigger2.connect(self.avg_mz_plot)
        self.mbt.start()
    except Exception as e:
        self.error('Running error, info: ' + str(e))
def __init__(self, fname, specStart=0):
    """Open ``fname`` and take the m/z axis from its first spectrum.

    :param fname: path of the imzML file.
    :param specStart: number of leading m/z values to drop from
        ``self.mzValues``; a warning is printed when it is non-zero.
    """
    self.fname = fname
    self.parser = ImzMLParser(fname)
    self.dregions = None

    # m/z axis of spectrum 0
    self.mzValues = self.parser.getspectrum(0)[0]

    self.specStart = specStart
    if specStart != 0:
        self.mzValues = self.mzValues[specStart:]
        print("WARNING: SPECTRA STARTING AT POSITION", self.specStart)

    self.find_regions()
class FSImzMLReader(ImzMLReader):
    """ImzML reader for a dataset stored on the local file system."""

    def __init__(self, path: Path):
        """Locate the imzML file under ``path`` and parse it.

        Parsing failures are re-raised as ``ImzMLError`` carrying the
        formatted traceback.
        """
        self.filename = find_file_by_ext(path, 'imzml')
        try:
            parser = ImzMLParser(
                self.filename,
                parse_lib='ElementTree',
                include_spectra_metadata=METADATA_FIELDS,
            )
            self._imzml_parser = parser
        except Exception as e:
            raise ImzMLError(format_exc()) from e
        super().__init__(self._imzml_parser)

    def iter_spectra(self, sp_idxs: Sequence[int]):
        """Yield ``(sp_idx, mzs, ints)`` for each requested spectrum index.

        Each spectrum is checked against the lengths declared by the parser
        and then passed through ``self._process_spectrum``.
        """
        parser = self._imzml_parser
        for requested_idx in sp_idxs:
            mzs, ints = parser.getspectrum(requested_idx)

            # Fewer values than declared means the .ibd file is truncated.
            assert len(mzs) == parser.mzLengths[requested_idx], 'Incomplete .ibd file'
            assert len(ints) == parser.intensityLengths[requested_idx], 'Incomplete .ibd file'
            assert len(mzs) == len(ints), \
                f"Spectrum {requested_idx} mz and intensity counts don't match"

            yield self._process_spectrum(requested_idx, mzs, ints)
def test_writer_image(get_temp_path, data_mode):
    """Write a 3x3 image and read it back through the parser."""
    mz_x = np.linspace(100, 1000, 20)
    # 3x3 grid at z=1, x-major order: [1,1,1], [1,2,1], ..., [3,3,1]
    coordinates = [[x, y, 1] for x in (1, 2, 3) for y in (1, 2, 3)]
    mz_ys = np.random.rand(len(coordinates), mz_x.shape[0])

    output_filename = os.path.join(get_temp_path, "test.imzML")
    with ImzMLWriter(output_filename, mode=data_mode) as imzml:
        for pixel_index, pixel_coordinates in enumerate(coordinates):
            imzml.add_spectrum(mz_x, mz_ys[pixel_index], coords=pixel_coordinates)

    with ImzMLParser(output_filename) as parser:
        for px, (read_mz, read_intensity) in enumerate(parser):
            assert_array_almost_equal(read_mz, mz_x, 4)
            assert_array_almost_equal(read_intensity, mz_ys[px], 4)
        assert parser.n_pixels == len(coordinates)
def __init__(self, imzml_filename, raw_data_filename, formulas_filename):
    """Load centroided and raw data and precompute spectra and patterns.

    :param imzml_filename: centroided dataset, opened with ``ImzMLParser``.
    :param raw_data_filename: raw data; ``.h5`` files are opened with h5py,
        ``.RAW`` files are kept as a path string.
    :param formulas_filename: file passed to ``generate_patterns``.
    :raises ValueError: if the raw data file has any other extension.
    """
    self.imzml = ImzMLParser(imzml_filename)
    self.formulas_fn = formulas_filename

    # The extension check is case-sensitive.
    if raw_data_filename.endswith(".h5"):
        self.raw = h5py.File(raw_data_filename)
    elif raw_data_filename.endswith(".RAW"):
        self.raw = str(raw_data_filename)
    else:
        raise ValueError("only .h5 and .RAW are supported")

    sample_size = 25
    logging.info("estimating resolution from %d random raw spectra..." % sample_size)
    self.resolution_func = resolution_estimate(self.raw, sample_size)
    logging.info("resolution is %d @ 200" % round(self.resolution_func(200)))

    self.mz_range = self.imzml.get_mz_range()
    logging.info("m/z range: %f .. %f" % self.mz_range)

    logging.info("generating isotope patterns...")
    self.patterns = generate_patterns(
        self.formulas_fn, self.resolution_func, self.mz_range)

    logging.info("computing mean spectrum...")
    raw_mzs, self.mean_intensities = read_mean_spectrum(self.raw)

    logging.info("computing mean spectrum from centroided data...")
    summary = generate_summary_spectrum3(raw_mzs, self.imzml)
    self.mzs, self.intensities, self.frequencies = summary

    self.n = 5
def import_spectra(filepath, spectra_format="imzml"):
    """Import the spectra stored in ``filepath``.

    :param filepath: path of the file to read.
    :param spectra_format: ``"imzml"``/``"imzML"`` (supported) or
        ``"brukerflex"``/``"xmass"``/``"Xmass"`` (recognized but not
        implemented).
    :return: list with the result of ``getspectrum`` for every pixel.
    :raises NotImplementedError: for the Bruker/Xmass formats.
    :raises ValueError: for any other format name.

    Unsupported formats used to fail with ``UnboundLocalError`` on the
    return statement; they now fail with an explicit error.
    """
    ############### IMZML
    if spectra_format in ("imzml", "imzML"):
        ##### Import the libraries
        install_required_packages("pyimzml")
        from pyimzml.ImzMLParser import ImzMLParser
        ##### Parse the imzML file
        parsed_imzml = ImzMLParser(filepath)
        ##### Generate the list of spectra
        # Coordinates may be (x, y) or (x, y, z); only the index is needed.
        return [
            parsed_imzml.getspectrum(i)
            for i, _ in enumerate(parsed_imzml.coordinates)
        ]
    ############### XMASS
    if spectra_format in ("brukerflex", "xmass", "Xmass"):
        raise NotImplementedError(
            "import of '%s' spectra is not implemented" % spectra_format)
    raise ValueError("unknown spectra format: %r" % (spectra_format,))
def collect_metadata(self):
    """Return the parser's ``imzmldict`` with numpy int64 values as ``int``.

    All other values are passed through unchanged.
    """
    print('parsing imzML from %s' % self.path)
    with ImzMLParser(self.path) as parser:
        cleaned = {}
        for key, value in parser.imzmldict.items():
            if type(value) == np.int64:
                cleaned[key] = int(value)
            else:
                cleaned[key] = value
    return cleaned
def load_and_split_ds_vm(storage, imzml_cobject, ibd_cobject, ds_segm_size_mb, sort_memory):
    """Download a dataset, split it into segments and upload the segments.

    All work happens in a temporary directory. Each stage is timed and
    recorded as ``(stage_name, seconds)``.

    :return: tuple ``(imzml_reader, ds_segments_bounds, ds_segms_cobjects,
        ds_segms_len, stats)``.
    """
    stats = []

    with TemporaryDirectory() as tmp_dir:
        logger.info("Temp dir is {}".format(tmp_dir))
        tmp_root = Path(tmp_dir)

        imzml_dir = tmp_root / 'imzml'
        res = imzml_dir.mkdir()
        logger.info("Create {} result {}".format(imzml_dir, res))

        segments_dir = tmp_root / 'segments'
        res = segments_dir.mkdir()
        logger.info("Create {} result {}".format(segments_dir, res))

        logger.info('Downloading dataset...')
        started = time()
        imzml_path, ibd_path = download_dataset(
            imzml_cobject, ibd_cobject, imzml_dir, storage)
        stats.append(('download_dataset', time() - started))

        logger.info('Loading parser...')
        started = time()
        imzml_parser = ImzMLParser(str(imzml_path))
        imzml_reader = imzml_parser.portable_spectrum_reader()
        stats.append(('load_parser', time() - started))

        logger.info('Defining segments bounds...')
        started = time()
        ds_segments_bounds = define_ds_segments(
            imzml_parser, ds_segm_size_mb=ds_segm_size_mb)
        segments_n = len(ds_segments_bounds)
        stats.append(('define_segments', time() - started))

        logger.info('Segmenting...')
        started = time()
        chunks_n, ds_segms_len = make_segments(
            imzml_reader, ibd_path, ds_segments_bounds, segments_dir, sort_memory)
        stats.append(('dataset_segmentation', time() - started))

        logger.info('Uploading segments...')
        started = time()
        ds_segms_cobjects = upload_segments(
            storage, segments_dir, chunks_n, segments_n)
        stats.append(('upload_segments', time() - started))

        return imzml_reader, ds_segments_bounds, ds_segms_cobjects, ds_segms_len, stats
def test_parser_init_paths(data_path, parse_lib):
    """Parser opened from a path exposes 9 pixels with sane spectra."""
    imzml = ImzMLParser(data_path, parse_lib=parse_lib)
    assert len(imzml.coordinates) == 9
    assert imzml.n_pixels == 9

    # First pixel: only consistency is checked.
    first_mz, first_int = imzml.get_spectrum(0)
    assert len(first_mz) == len(first_int)
    assert len(first_mz) > 0
    assert len(first_int) > 0

    # Fifth pixel: exact length and value ranges.
    mid_mz, mid_int = imzml.get_spectrum(4)
    assert len(mid_mz) == len(mid_int)
    assert len(mid_mz) == 8399
    assert len(mid_int) == 8399
    assert np.all(mid_mz > 100.0)
    assert np.all(mid_mz < 800.0)
    assert np.all(mid_int >= 0.0)
    assert np.all(mid_int < 3.0)
def test_parser_init_paths_as_with(data_path, parse_lib):
    """Parser works as a context manager when opened from a path."""
    with ImzMLParser(data_path, parse_lib=parse_lib) as imzml:
        assert len(imzml.coordinates) == 9
        assert imzml.n_pixels == 9

        first_mz, first_int = imzml.get_spectrum(0)
        assert len(first_mz) == len(first_int)
        assert len(first_mz) > 0
        assert len(first_int) > 0
def run(self):
    """Write one summary image per entry of ``self.im_types`` as JSON.

    Each image type names a numpy function (looked up with
    ``getattr(np, im_type)``) that is applied to the intensities of every
    spectrum. The result is stored at ``[y - 1, x - 1]`` and written to
    ``self.output()[i].path`` as ``{'im_vect': ..., 'im_shape': ...}``.
    """
    from pyimzml.ImzMLParser import ImzMLParser
    import json

    parser = ImzMLParser(self.imzml_filename)

    # One zero-filled (rows, columns) image per requested type.
    images = {
        im_type: np.zeros((parser.imzmldict["max count of pixels y"],
                           parser.imzmldict["max count of pixels x"]))
        for im_type in self.im_types
    }

    for spectrum_index, (x, y, z_) in enumerate(parser.coordinates):
        mzs, ints = parser.getspectrum(spectrum_index)
        for im_type in self.im_types:
            reducer = getattr(np, im_type)
            images[im_type][y - 1, x - 1] = reducer(ints)

    for position, im_type in enumerate(self.im_types):
        image = images[im_type]
        result = {
            'im_vect': list(image.flatten()),
            'im_shape': np.shape(image)
        }
        with open(self.output()[position].path, 'w+') as f:
            json.dump(result, f)
def test_browse(data_path, parse_lib, item_ids):
    """Browsing every spectrum finds at least one of the requested ids."""
    imzml = ImzMLParser(data_path, parse_lib=parse_lib)
    browser = browse(imzml)
    assert browser

    found_ids = set()
    for pixel_index in range(imzml.n_pixels):
        spectrum_view = browser.for_spectrum(pixel_index)
        found_ids.update(spectrum_view.get_ids(item_ids))
    assert len(found_ids) != 0
def test_parser_init_ibd_as_filename(imzml_path, ibd_path, parse_lib):
    """Parser accepts the .ibd file passed explicitly through ``ibd_file``."""
    with ImzMLParser(imzml_path, parse_lib=parse_lib, ibd_file=ibd_path) as imzml:
        assert len(imzml.coordinates) == 9
        assert imzml.n_pixels == 9

        first_mz, first_int = imzml.get_spectrum(0)
        assert len(first_mz) == len(first_int)
        assert len(first_mz) > 0
        assert len(first_int) > 0
def spectrum_iter(self):
    """
    Generator function that yields a position and associated spectrum for a
    selected datacube type.

    :yield: (xidx, yidx) a tuple of ints representing x and y position in
        the image, shifted by ``self.x_pos_min`` / ``self.y_pos_min``
    :yield: yi, a numpy 1D-array of floats containing spectral intensities
        at the given position and for the selected datacube type
    """
    reader = ImzMLParser(self.basename)
    for idx, coordinate in enumerate(reader.coordinates):
        # Accept both (x, y) and (x, y, z) coordinates; z is not used.
        xidx, yidx = coordinate[:2]
        # Coordinates may start at arbitrary locations, hence, we need to
        # subtract the minimum to recenter at (0,0)
        xidx -= self.x_pos_min
        yidx -= self.y_pos_min

        mz, intens = reader.getspectrum(idx)

        # Resample onto the common m/z axis if we are in processed mode
        # (linear interpolation, 0 outside the measured range).
        if self.imzml_type == self.available_imzml_types['processed']:
            f = interpolate.interp1d(mz, intens, fill_value=0, bounds_error=False)
            intens = f(self.mz)

        yield (xidx, yidx), np.asarray(intens)
def main(argv):
    """Copy an imzML file with float32 arrays and compare the first spectrum.

    :param argv: command line arguments without the program name:
        ``-i/--ifile <inputfile>``, ``-o/--ofile <outputfile>``, ``-h``.
        The output defaults to ``<inputfile>.imzML``.
    :raises IOError: if no input file was given.

    Prints True when the first spectrum read back from the copy equals the
    first spectrum of the input (m/z values, intensities and coordinates).
    """
    usage = 'test.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile == '':
        print(usage)
        raise IOError('input file not specified')
    if outputfile == '':
        outputfile = inputfile + '.imzML'

    # Imported after argument validation so usage/help work without pyimzml.
    from pyimzml.ImzMLParser import ImzMLParser
    from pyimzml.ImzMLWriter import ImzMLWriter

    imzml = ImzMLParser(inputfile)
    spectra = []
    with ImzMLWriter(outputfile, mz_dtype=np.float32, intensity_dtype=np.float32) as writer:
        for i, coords in enumerate(imzml.coordinates):
            mzs, intensities = imzml.getspectrum(i)
            writer.addSpectrum(mzs, intensities, coords)
            spectra.append((mzs, intensities, coords))

    imzml = ImzMLParser(outputfile)
    spectra2 = []
    for i, coords in enumerate(imzml.coordinates):
        mzs, intensities = imzml.getspectrum(i)
        spectra2.append((mzs, intensities, coords))

    if not spectra or not spectra2:
        # Nothing to compare element-wise; equal only if both are empty.
        print(len(spectra) == len(spectra2))
        return

    # ``tuple == tuple`` would evaluate the truth value of an array
    # comparison and raise ValueError, so compare the parts explicitly.
    mzs_in, ints_in, coords_in = spectra[0]
    mzs_out, ints_out, coords_out = spectra2[0]
    print(np.array_equal(mzs_in, mzs_out)
          and np.array_equal(ints_in, ints_out)
          and tuple(coords_in) == tuple(coords_out))
def __init__(self, nmf_fn, layers_fn, imzml_fn):
    """Load an NMF factorization and the imzML dataset it belongs to.

    :param nmf_fn: ``np.load``-able file with the entries ``shape``, ``W``,
        ``H`` and ``mz_axis``.
    :param layers_fn: not used by this initializer.
    :param imzml_fn: path of the imzML file.
    """
    self._imzml = ImzMLParser(imzml_fn)

    with np.load(nmf_fn) as data:
        nx, ny = data['shape']
        # W is stored flat; restore it to (nx, ny, n_components).
        self._W = data['W'].reshape((nx, ny, -1))
        self._H = data['H']
        self._mz_axis = data['mz_axis']

    # Norm caches, filled elsewhere.
    self._norm_real = {}
    self._norm_simulated = {}
    self._norm_groundtruth = {}
    self._norm_noise = {}
    self._norm_diff = {}

    # (x, y) -> spectrum index
    self._coords = {
        (position[0], position[1]): spectrum_index
        for spectrum_index, position in enumerate(self._imzml.coordinates)
    }

    # Upper bin edge for every (mz, ppm) pair of the axis.
    self._mz_bins = [mz * (1.0 + 1e-6 * ppm) for mz, ppm in self._mz_axis]
class inMemoryIMS():
    """Imaging mass-spectrometry dataset read from an .hdf5 or .imzml file.

    ``load_file`` reads the spectra through ``get_spectrum``. When
    ``cache_spectra`` is set, all centroids are kept in three parallel
    arrays sorted by m/z (``mz_list``, ``count_list``, ``idx_list``), which
    ``get_ion_image`` and ``generate_summary_spectrum`` search.

    NOTE(review): the ``print`` statements make this class Python 2 only.
    """
    def __init__(self, filename, min_mz=0., max_mz=np.inf, min_int=0., index_range=[],cache_spectra=True,do_summary=True,norm=''):
        """Load ``filename``; all other arguments are forwarded to ``load_file``."""
        # NOTE(review): file_size is never used afterwards.
        file_size = os.path.getsize(filename)
        self.load_file(filename, min_mz, max_mz, min_int, index_range=index_range,cache_spectra=cache_spectra,do_summary=do_summary,norm=norm)

    def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=[],cache_spectra=True,do_summary=True,norm=[]):
        """Open ``filename`` and optionally load its centroids into memory.

        :param filename: path ending in ``.hdf5`` or ``.imzml`` (the
            extension is compared case-insensitively).
        :param min_mz: only centroids with m/z strictly above this are kept.
        :param max_mz: only centroids with m/z strictly below this are kept.
        :param min_int: only centroids with counts strictly above this are kept.
        :param index_range: spectrum indices to load (hdf5 only); ``[]``
            means every key under ``/spectral_data``.
        :param cache_spectra: keep the centroids in the sorted arrays.
        :param do_summary: record per-spectrum ``tic`` (sum of counts) and
            ``mic`` (max of counts).
        :param norm: stored as ``self.norm`` and applied by ``get_spectrum``.
        :raises TypeError: for an unsupported extension, or when a spectrum
            has a different number of m/z values and counts.
        """
        # parse file to get required parameters
        # can use thin hdf5 wrapper for getting data from file
        self.file_dir, self.filename = os.path.split(filename)
        self.filename, self.file_type = os.path.splitext(self.filename)
        self.file_type = self.file_type.lower()
        self.norm=norm
        if self.file_type == '.hdf5':
            import h5py
            self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
            if index_range == []:
                self.index_list = map(int, self.hdf['/spectral_data'].keys())
            else:
                self.index_list = index_range
        elif self.file_type == '.imzml':
            from pyimzml.ImzMLParser import ImzMLParser
            self.imzml = ImzMLParser(filename)
            self.index_list=range(0,len(self.imzml.coordinates))
        else:
            raise TypeError('File type not recogised: {}'.format(self.file_type))
        self.max_index = max(self.index_list)
        self.coords = self.get_coords()
        step_size = self.get_step_size()
        # The datacube is only used to precompute the pixel layout.
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coords)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self.histogram_mz_axis = {}
        # Sentinels, narrowed while the spectra are read below.
        self.mz_min = 9999999999999.
        self.mz_max = 0.
        if any([cache_spectra,do_summary]) == True:
            # load data into memory
            self.mz_list = []
            self.count_list = []
            self.idx_list = []
            if do_summary:
                self.mic=np.zeros((len(self.index_list),1))
                self.tic=np.zeros((len(self.index_list),1))
            for ii in self.index_list:
                # load spectrum, keep values gt0 (shouldn't be here anyway)
                this_spectrum = self.get_spectrum(ii)
                mzs, counts = this_spectrum.get_spectrum(source='centroids')
                if len(mzs) != len(counts):
                    raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
                # Enforce data limits
                valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
                counts = counts[valid]
                mzs = mzs[valid]
                # record min/max
                # NOTE(review): uses the first/last element, so this assumes
                # mzs is sorted ascending and non-empty after filtering.
                if mzs[0]<self.mz_min:
                    self.mz_min = mzs[0]
                if mzs[-1]>self.mz_max:
                    self.mz_max = mzs[-1]
                # append ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
                if cache_spectra:
                    self.mz_list.append(mzs)
                    self.count_list.append(counts)
                    self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
                #record summary values
                if do_summary:
                    self.tic[ii]=sum(counts)
                    self.mic[ii]=max(counts)
            print 'loaded spectra'
            if cache_spectra:
                self.mz_list = np.concatenate(self.mz_list)
                self.count_list = np.concatenate(self.count_list)
                self.idx_list = np.concatenate(self.idx_list)
                # sort by mz for fast image formation
                mz_order = np.argsort(self.mz_list)
                self.mz_list = self.mz_list[mz_order]
                self.count_list = self.count_list[mz_order]
                self.idx_list = self.idx_list[mz_order]
                # split binary searches into two stages for better locality
                self.window_size = 1024
                self.mz_sublist = self.mz_list[::self.window_size].copy()
        print 'file loaded'

    def get_step_size(self):
        """Return ``[1,1,1]`` for imzml files and ``[]`` otherwise."""
        if self.file_type == '.imzml':
            return [1,1,1]
        else:
            return []

    def get_coords(self):
        """Return the coordinates array of the open file."""
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            coords = self.get_coords_imzml()
            # swap the first two coordinate columns
            coords[:,[0, 1]] = coords[:,[1, 0]]
        elif self.file_type == '.hdf5':
            coords = self.get_coords_hdf5()
        return coords

    def get_coords_imzml(self):# get real world coordinates
        """Return the parser coordinates as an array with three columns."""
        print('TODO: convert indices into real world coordinates')
        coords = np.asarray(self.imzml.coordinates)
        if len(self.imzml.coordinates[0]) == 2: #2D - append zero z-coord
            coords = np.concatenate((coords,np.zeros((len(coords),1))),axis=1)
        return coords

    def get_coords_hdf5(self):
        """Read ``/spectral_data/<k>/coordinates/`` for every index ``k``."""
        coords = np.zeros((len(self.index_list), 3))
        for k in self.index_list:
            coords[k, :] = self.hdf['/spectral_data/' + str(k) + '/coordinates/']
        return coords

    def get_spectrum(self,index):
        """Return spectrum ``index``, with centroids normalized per ``self.norm``.

        ``'TIC'`` divides the counts by their sum, ``'RMS'`` by their
        root-mean-square and ``'MAD'`` by the median absolute deviation from
        the mean. Any other value leaves the counts unchanged.
        """
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            this_spectrum = self.get_spectrum_imzml(index)
        elif self.file_type == '.hdf5':
            this_spectrum = self.get_spectrum_hdf5(index)
        if self.norm != []:
            mzs,counts = this_spectrum.get_spectrum(source="centroids")
            if self.norm == 'TIC':
                counts = counts / np.sum(counts)
            elif self.norm == 'RMS':
                counts = counts / np.sqrt(np.mean(np.square(counts)))
            elif self.norm == 'MAD':
                counts = counts/np.median(np.absolute(counts - np.mean(counts)))
            this_spectrum.add_centroids(mzs,counts)
        return this_spectrum

    def get_spectrum_imzml(self,index):
        """Read spectrum ``index`` from the imzML parser as centroids."""
        mzs, intensities = self.imzml.getspectrum(index)
        ## temp hack -> assume centroided
        this_spectrum = mass_spectrum()
        this_spectrum.add_centroids(mzs,intensities)
        return this_spectrum

    def get_spectrum_hdf5(self, index):
        """Read spectrum ``index`` from the hdf5 file.

        Profile data and centroids are both optional, but at least one of
        them must be present.

        :raises ValueError: if neither is stored for ``index``.
        """
        import h5py
        this_spectrum = mass_spectrum()
        tmp_str = '/spectral_data/%d' % (index)
        try:
            this_spectrum.add_spectrum(self.hdf[tmp_str + '/mzs/'], self.hdf[tmp_str + '/intensities/'])
            got_spectrum = True
        except KeyError:
            got_spectrum = False
        try:
            this_spectrum.add_centroids(self.hdf[tmp_str + '/centroid_mzs/'], self.hdf[tmp_str + '/centroid_intensities/'])
            got_centroids = True
        except KeyError:
            got_centroids = False
        if not any([got_spectrum, got_centroids]):
            raise ValueError('No spectral data found in index {}'.format(index))
        return this_spectrum

    def empty_datacube(self):
        """Return an ``ion_datacube`` carrying this dataset's pixel layout."""
        data_out = ion_datacube()
        # add precomputed pixel indices
        data_out.coords = self.coords
        data_out.pixel_indices = self.cube_pixel_indices
        data_out.nRows = self.cube_n_row
        data_out.nColumns = self.cube_n_col
        return data_out

    def get_ion_image(self, mzs, tols, tol_type='ppm'):
        """Return a datacube with one ion image per requested m/z.

        :param mzs: a single m/z value, or a list/array of them.
        :param tols: tolerance(s); converted with ``tols * mzs / 1e6`` when
            ``tol_type`` is ``'ppm'``, used as given otherwise.
        :param tol_type: ``'ppm'`` (default) or anything else.

        Requires the cached, sorted arrays built by ``load_file``.
        """
        data_out = self.empty_datacube()
        # NOTE(review): search_sort and search_bisect are never called in
        # this method, and blank_dataout is not defined in the visible code.
        def search_sort(mzs,tols):
            data_out = blank_dataout()
            idx_left = np.searchsorted(self.mz_list, mzs - tols, 'l')
            idx_right = np.searchsorted(self.mz_list, mzs + tols, 'r')
            for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
                if any((mz<self.mz_list[0],mz>self.mz_list[-1])):
                    data_out.add_xic(np.zeros(np.shape(self.cube_pixel_indices)), [mz], [tol])
                    continue
                # slice list for code clarity
                mz_vect=self.mz_list[il:ir]
                idx_vect = self.idx_list[il:ir]
                count_vect = self.count_list[il:ir]
                # bin vectors
                ion_vect = np.bincount(idx_vect, weights=count_vect, minlength=self.max_index + 1)
                data_out.add_xic(ion_vect, [mz], [tol])
            return data_out
        def search_bisect(mzs,tols):
            data_out = blank_dataout()
            for mz,tol in zip(mzs,tols):
                if any((mz<self.mz_list[0],mz>self.mz_list[-1])):
                    data_out.add_xic(np.zeros(np.shape(self.cube_pixel_indices)), [mz], [tol])
                    continue
                mz_upper = mz + tol
                mz_lower = mz - tol
                il = bisect.bisect_left(self.mz_list,mz_lower)
                ir = bisect.bisect_right(self.mz_list,mz_upper)
                # slice list for code clarity
                mz_vect=self.mz_list[il:ir]
                idx_vect = self.idx_list[il:ir]
                count_vect = self.count_list[il:ir]
                # bin vectors
                ion_vect = np.bincount(idx_vect, weights=count_vect, minlength=self.max_index + 1)
                data_out.add_xic(ion_vect, [mz], [tol])
            return data_out
        if type(mzs) not in (np.ndarray, list):
            mzs = np.asarray([mzs, ])
        if tol_type == 'ppm':
            tols = tols * mzs / 1e6  # to m/z
        # Fast search for insertion point of mz in self.mz_list
        # First stage is looking for windows using the sublist
        idx_left = np.searchsorted(self.mz_sublist, mzs - tols, 'l')
        idx_right = np.searchsorted(self.mz_sublist, mzs + tols, 'r')
        for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
            l = max(il - 1, 0) * self.window_size
            r = ir * self.window_size
            # Second stage is binary search within the windows
            il = l + np.searchsorted(self.mz_list[l:r], mz - tol, 'l')
            ir = l + np.searchsorted(self.mz_list[l:r], mz + tol, 'r')
            # slice list for code clarity
            mz_vect=self.mz_list[il:ir]
            idx_vect = self.idx_list[il:ir]
            count_vect = self.count_list[il:ir]
            # bin vectors: sum the counts per spectrum index
            ion_vect = np.bincount(idx_vect, weights=count_vect, minlength=self.max_index + 1)
            data_out.add_xic(ion_vect, [mz], [tol])
        return data_out

    # Form histogram axis
    def generate_histogram_axis(self, ppm=1.):
        """Build and cache an m/z axis whose steps grow by ``ppm`` parts per million.

        The axis starts at ``self.mz_min`` and ends with the first value
        above ``self.mz_max``.
        """
        ppm_mult = ppm * 1e-6
        mz_current = self.mz_min
        mz_list = [mz_current,]
        while mz_current <= self.mz_max:
            mz_current = mz_current + mz_current * ppm_mult
            mz_list.append(mz_current)
        self.histogram_mz_axis[ppm] = mz_list

    def get_histogram_axis(self, ppm=1.):
        """Return the cached histogram axis for ``ppm``, generating it if needed."""
        try:
            mz_axis = self.histogram_mz_axis[ppm]
        except KeyError as e:
            print 'generating histogram axis for ppm {}'.format(ppm)
            self.generate_histogram_axis(ppm=ppm)
        return self.histogram_mz_axis[ppm]

    def generate_summary_spectrum(self, summary_type='mean', ppm=1.):
        """Summarize all cached centroids along the ``ppm`` histogram axis.

        :param summary_type: ``'mean'`` sums the counts in each bin,
            ``'freq'`` counts the distinct spectra with a centroid in the
            bin; both are divided by the number of spectra.
        :param ppm: bin width passed to ``get_histogram_axis``.
        :return: ``(hist_axis, mean_spec)``; the last entry of ``mean_spec``
            is never filled and stays 0.
        :raises ValueError: for any other ``summary_type``.
        """
        hist_axis = self.get_histogram_axis(ppm=ppm)
        # calculate mean along some m/z axis
        mean_spec = np.zeros(np.shape(hist_axis))
        for ii in range(0, len(hist_axis) - 1):
            mz_upper = hist_axis[ii + 1]
            mz_lower = hist_axis[ii]
            idx_left = bisect.bisect_left(self.mz_list, mz_lower)
            idx_right = bisect.bisect_right(self.mz_list, mz_upper)
            # slice list for code clarity
            count_vect = self.count_list[idx_left:idx_right]
            if summary_type == 'mean':
                count_vect = self.count_list[idx_left:idx_right]
                mean_spec[ii] = np.sum(count_vect)
            elif summary_type == 'freq':
                idx_vect = self.idx_list[idx_left:idx_right]
                mean_spec[ii] = float(len(np.unique(idx_vect)))
            else:
                raise ValueError('Summary type not recognised; {}'.format(summary_type))
        if summary_type == 'mean':
            mean_spec = mean_spec / len(self.index_list)
        elif summary_type == 'freq':
            mean_spec = mean_spec / len(self.index_list)
        return hist_axis, mean_spec

    def get_summary_image(self,summary_func='tic'):
        """Return a datacube holding the per-spectrum ``tic`` or ``mic`` values.

        :raises KeyError: if ``summary_func`` is neither ``'tic'`` nor ``'mic'``.
        """
        if summary_func not in ['tic','mic']: raise KeyError("requested type not in 'tic' mic'")
        data_out = ion_datacube()
        # add precomputed pixel indices
        data_out.coords = self.coords
        data_out.pixel_indices = self.cube_pixel_indices
        data_out.nRows = self.cube_n_row
        data_out.nColumns = self.cube_n_col
        data_out.add_xic(np.asarray(getattr(self, summary_func))[self.index_list], [0], [0])
        return data_out
def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=None,
              cache_spectra=True, do_summary=True, norm=None):
    """
    Open an imaging MS file and optionally cache its centroided peaks in memory.

    The reader is chosen from the (case-insensitive) file extension: ``.hdf5`` is
    opened read-only with h5py, ``.imzml`` with pyimzml's ``ImzMLParser``.

    :param filename: path of the dataset to open.
    :param min_mz: a peak is kept only if its m/z is strictly greater than this.
    :param max_mz: a peak is kept only if its m/z is strictly less than this.
    :param min_int: a peak is kept only if its intensity is strictly greater than this.
    :param index_range: spectrum indices to load from an ``.hdf5`` file; ``None`` or an
        empty sequence selects every key under ``/spectral_data``. Not consulted for
        imzML files, where every spectrum is loaded.
    :param cache_spectra: if true, keep all retained peaks in ``self.mz_list``,
        ``self.count_list`` and ``self.idx_list`` (sorted by m/z) and build
        ``self.mz_sublist``, a copy of every ``self.window_size``-th m/z value.
    :param do_summary: if true, fill ``self.tic`` (sum of retained intensities) and
        ``self.mic`` (maximum retained intensity) for every loaded spectrum.
    :param norm: stored unchanged on ``self.norm``; ``None`` yields a new empty list.
    :raises TypeError: if the extension is neither ``.hdf5`` nor ``.imzml``, or if a
        spectrum returns m/z and intensity arrays of different lengths.
    :return: None
    """
    # parse file to get required parameters
    # can use thin hdf5 wrapper for getting data from file
    self.file_dir, self.filename = os.path.split(filename)
    self.filename, self.file_type = os.path.splitext(self.filename)
    self.file_type = self.file_type.lower()
    # A literal [] default would be one list shared by every instance that omits norm.
    self.norm = [] if norm is None else norm
    if self.file_type == '.hdf5':
        import h5py
        self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
        if index_range is None or len(index_range) == 0:
            # A real list (not a lazy map object) so it can be iterated more than once.
            self.index_list = [int(key) for key in self.hdf['/spectral_data'].keys()]
        else:
            self.index_list = index_range
    elif self.file_type == '.imzml':
        from pyimzml.ImzMLParser import ImzMLParser
        self.imzml = ImzMLParser(filename)
        self.index_list = range(0, len(self.imzml.coordinates))
    else:
        raise TypeError('File type not recogised: {}'.format(self.file_type))
    self.max_index = max(self.index_list)
    self.coords = self.get_coords()
    step_size = self.get_step_size()
    cube = ion_datacube(step_size=step_size)
    cube.add_coords(self.coords)
    self.cube_pixel_indices = cube.pixel_indices
    self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
    self.histogram_mz_axis = {}
    self.mz_min = 9999999999999.
    self.mz_max = 0.
    if cache_spectra or do_summary:
        # load data into memory
        self.mz_list = []
        self.count_list = []
        self.idx_list = []
        if do_summary:
            # tic/mic are addressed by spectrum index below, so they need a slot for
            # the largest index even when index_list is a sparse subset.
            n_slots = max(len(self.index_list), self.max_index + 1)
            self.mic = np.zeros((n_slots, 1))
            self.tic = np.zeros((n_slots, 1))
        for ii in self.index_list:
            # load spectrum, keep values gt0 (shouldn't be here anyway)
            this_spectrum = self.get_spectrum(ii)
            mzs, counts = this_spectrum.get_spectrum(source='centroids')
            if len(mzs) != len(counts):
                raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
            # Enforce data limits
            valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
            counts = counts[valid]
            mzs = mzs[valid]
            has_peaks = len(mzs) > 0
            # record min/max; skipped for spectra left empty by the filter, which
            # would otherwise fail on mzs[0].
            # NOTE(review): first/last element is used as min/max, which presumes each
            # spectrum's m/z values are ascending -- confirm against the readers.
            if has_peaks:
                if mzs[0] < self.mz_min:
                    self.mz_min = mzs[0]
                if mzs[-1] > self.mz_max:
                    self.mz_max = mzs[-1]
            # append ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
            if cache_spectra:
                self.mz_list.append(mzs)
                self.count_list.append(counts)
                self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
            # record summary values; an empty spectrum keeps its initial 0 entries
            if do_summary and has_peaks:
                self.tic[ii] = np.sum(counts)
                self.mic[ii] = np.max(counts)
        print('loaded spectra')
        if cache_spectra:
            self.mz_list = np.concatenate(self.mz_list)
            self.count_list = np.concatenate(self.count_list)
            self.idx_list = np.concatenate(self.idx_list)
            # sort by mz for fast image formation
            mz_order = np.argsort(self.mz_list)
            self.mz_list = self.mz_list[mz_order]
            self.count_list = self.count_list[mz_order]
            self.idx_list = self.idx_list[mz_order]
            # split binary searches into two stages for better locality
            self.window_size = 1024
            self.mz_sublist = self.mz_list[::self.window_size].copy()
    print('file loaded')
class CleanImageSearch(object):
    """Find molecules with clean isotope patterns in a dataset and image them.

    Construction reads the imzML file and the raw data, estimates the
    resolution, generates isotope patterns for the given formulas and computes
    the summary spectra the search methods work on.
    """

    def __init__(self, imzml_filename, raw_data_filename, formulas_filename):
        """Load all inputs and precompute patterns and summary spectra.

        :param imzml_filename: path handed to ``ImzMLParser``.
        :param raw_data_filename: raw data path; must end in ``.h5`` (opened with
            h5py) or ``.RAW`` (kept as a plain string).
        :param formulas_filename: path handed to ``generate_patterns``.
        :raises ValueError: for any other raw data extension.
        """
        self.imzml = ImzMLParser(imzml_filename)
        self.formulas_fn = formulas_filename

        is_h5 = raw_data_filename.endswith(".h5")
        is_raw = raw_data_filename.endswith(".RAW")
        if not (is_h5 or is_raw):
            raise ValueError("only .h5 and .RAW are supported")
        self.raw = h5py.File(raw_data_filename) if is_h5 else str(raw_data_filename)

        sample_size = 25
        logging.info("estimating resolution from %d random raw spectra..." % sample_size)
        self.resolution_func = resolution_estimate(self.raw, sample_size)
        resolution_at_200 = round(self.resolution_func(200))
        logging.info("resolution is %d @ 200" % resolution_at_200)

        self.mz_range = self.imzml.get_mz_range()
        logging.info("m/z range: %f .. %f" % self.mz_range)

        logging.info("generating isotope patterns...")
        self.patterns = generate_patterns(self.formulas_fn, self.resolution_func, self.mz_range)

        logging.info("computing mean spectrum...")
        profile_mzs, self.mean_intensities = read_mean_spectrum(self.raw)

        logging.info("computing mean spectrum from centroided data...")
        summary = generate_summary_spectrum3(profile_mzs, self.imzml)
        self.mzs, self.intensities, self.frequencies = summary

        # number of isotope peaks (image rows) used per molecule
        self.n = 5

    def _make_match(self, formula, adduct):
        """Build a SpectralMatch for one (formula, adduct) pair."""
        return SpectralMatch(formula, adduct, self.patterns, self.mzs,
                             self.intensities, self.resolution_func)

    def find_good_matches(self, min_peaks=3, min_intensity_share=0.99, min_iso_corr=0.95):
        """Return matches with at least ``self.n`` theoretical peaks.

        Candidates come from ``find_clean_molecules``; the result is sorted by
        the first theoretical m/z of each match.
        """
        found = find_clean_molecules(self.mzs, self.intensities, self.patterns,
                                     min_peaks=min_peaks,
                                     min_intensity_share=min_intensity_share,
                                     min_iso_corr=min_iso_corr)

        def first_pattern_mz(key):
            return self.patterns[key][0][0]

        kept = []
        for formula, adduct in sorted(found, key=first_pattern_mz):
            candidate = self._make_match(formula, adduct)
            if len(candidate.theor_mzs) >= self.n:
                kept.append(candidate)
        kept.sort(key=lambda match: match.theor_mzs[0])
        return kept

    def _sf2m(self, formulas):
        """Turn (formula, adduct) string tuples into SpectralMatch objects.

        An empty input gives a new empty list; anything whose first element is
        not a tuple starting with a ``str`` is returned as is.
        """
        if len(formulas) == 0:
            return []
        head = formulas[0]
        if not (isinstance(head, tuple) and isinstance(head[0], str)):
            return formulas
        return [self._make_match(formula, adduct) for formula, adduct in formulas]

    def extract_images(self, formulas, n_bins=15):
        """Return one MolecularImage per entry of ``formulas``."""
        matches = self._sf2m(formulas)
        stacked, nrow, ncol = _get_images(matches, self.imzml, self.n, n_bins)
        result = []
        start = 0
        for match in matches:
            stop = start + self.n
            rows = stacked[start:stop, :]
            result.append(MolecularImage(rows, nrow, ncol, match.formula,
                                         match.adduct, self.patterns))
            start = stop
        return result

    def extract_figures(self, formulas, min_img_corr=0.7, n_bins=15, **kwargs):
        """Return MolecularFigure objects for entries passing the correlation cut.

        An entry is skipped when its ``image_correlation`` over its own image
        rows is below ``min_img_corr``. Extra keyword arguments are forwarded
        to ``MolecularFigure``.
        """
        matches = self._sf2m(formulas)
        stacked, nrow, ncol = _get_images(matches, self.imzml, self.n, n_bins)
        result = []
        start = 0
        for match in matches:
            stop = start + self.n
            correlation = match.image_correlation(stacked[start:stop, :])
            if not (correlation < min_img_corr):
                figure = MolecularFigure(self.n, match, stacked[start:stop, :], nrow, ncol,
                                         self.mzs, self.intensities, self.mean_intensities,
                                         self.frequencies, **kwargs)
                result.append(figure)
            start = stop
        return result
def __compute_file_info(cls, filename, resolution):
    """
    Internal helper function used to compute the mz axis, data type for the intensities, format type
    and the metadata of an imzML file.

    The file is scanned once: it is classified as 'continuous' when every spectrum has the
    same m/z values as the first spectrum and as 'processed' otherwise. For processed files
    the returned m/z axis is rebuilt as a log-spaced axis between the smallest and largest
    m/z seen, with ``ceil(1e6 * ln(max_mz / min_mz) / resolution)`` points.

    :param filename: Path of the imzML file.
    :param resolution: Controls the number of points on the rebuilt axis (processed files only).

    :return: Numpy array with the coordinates as stored in the file (they are not recentred)
    :return: Numpy array with mz axis
    :return: string with data type of the intensities of the first spectrum
    :return: imzml file type (an entry of ``cls.available_imzml_types``)
    :return: dataset metadata
    :return: instrument metadata
    :return: method metadata
    """
    ## TODO completely refactor this to make it smartly handle profile or centroid datasets
    ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
    ## TODO: profile datasets should work as is
    ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
    reader = ImzMLParser(filename)
    # Read the first spectrum
    mz_axes, intens = reader.getspectrum(0)
    # Read the coordinates
    coordinates = np.asarray(reader.coordinates)

    # Determine the data type for the intensity values
    dtype = np.asarray(intens).dtype.str

    # Compute the mz axis and file type
    file_type = cls.available_imzml_types['continuous']
    min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
    for ind in range(coordinates.shape[0]):
        mz, intens = reader.getspectrum(ind)
        # array_equal gives a single bool for tuples, lists and numpy arrays alike
        # (including arrays of different length); a plain == on arrays is elementwise
        # and cannot be used as an if-condition.
        if not np.array_equal(mz, mz_axes):
            file_type = cls.available_imzml_types['processed']
            if min_mz > np.amin(mz):
                min_mz = np.amin(mz)
            if max_mz < np.amax(mz):
                max_mz = np.amax(mz)

    # Reinterpolate the mz-axis if we have a processed mode imzml file
    if file_type == cls.available_imzml_types['processed']:
        # logspace needs an integer sample count; np.ceil returns a float
        num_points = int(np.ceil(1e6 * np.log(max_mz / min_mz) / resolution))
        mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), num_points)
        log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

    # Construct the imzml metadata information
    dataset_metadata = metadata_dict()
    instrument_metadata = metadata_dict()
    method_metadata = metadata_dict()
    for k, v in reader.imzmldict.items():
        dataset_metadata[k] = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=k,
                                             ontology=None)

    # Delete the parser and read the metadata
    del reader

    # Parse the metadata for the file. We try to parse only the header and ignore the
    # <run > group in the XML file to avoid going through the whole file again
    # while extracting the majority of the relevant metadata
    try:
        with open(filename, 'r') as ins:
            metdata_header = ''
            for line in ins:
                if '<run' in line:
                    break
                else:
                    metdata_header += line
            # Close the root element that was cut off together with the <run> group
            metdata_header += '</mzML>'
            metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
            for k, v in metdata_header_dict.items():
                store_value = metadata_value(name=k,
                                             value=v,
                                             unit=None,
                                             description=str(k) + " extracted from imzML XML header.",
                                             ontology=None)
                if k == 'instrumentConfigurationList':
                    instrument_metadata[k] = store_value
                elif k == 'dataProcessingList':
                    method_metadata[k] = store_value
                elif k == 'scanSettingsList':
                    dataset_metadata[k] = store_value
                elif k == 'softwareList':
                    method_metadata[k] = store_value
                elif k == 'sampleList':
                    method_metadata[k] = store_value
                else:
                    dataset_metadata[k] = store_value
            dataset_metadata['imzml_xml_metadata_header'] = metadata_value(name='imzml_xml_metadata_header',
                                                                           value=metdata_header,
                                                                           unit=None,
                                                                           description='XML imzML header',
                                                                           ontology=None)
    except Exception:
        # Header metadata is best effort: any parsing problem is logged and ignored,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        log_helper.warning(__name__, "Extraction of additional imzML metadata failed")
    return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
class NoiseGenerator(object):
    """Add noise taken from a real dataset to simulated spectra.

    The noise at a pixel is the absolute difference between the binned real
    spectrum and its low-rank approximation ``W[x, y, :].dot(H)`` loaded from
    ``nmf_fn``. Per-pixel norms of the real, simulated, ground-truth and noise
    signals are collected and can be written out with ``saveStatistics``.
    """

    def __init__(self, nmf_fn, layers_fn, imzml_fn):
        """Load the factorisation and index the real dataset by (x, y).

        :param nmf_fn: ``.npz`` file with arrays ``shape``, ``W``, ``H`` and
            ``mz_axis`` (rows of ``(mz, ppm)``).
        :param layers_fn: only used by the disabled ``_removeAssignedBins`` step.
        :param imzml_fn: imzML file holding the real spectra.
        """
        self._imzml = ImzMLParser(imzml_fn)
        with np.load(nmf_fn) as data:
            nx, ny = data['shape']
            self._W = data['W'].reshape((nx, ny, -1))
            self._H = data['H']
            self._mz_axis = data['mz_axis']

        # per-pixel statistics, keyed by (x, y)
        self._norm_real = {}
        self._norm_simulated = {}
        self._norm_groundtruth = {}
        self._norm_noise = {}
        self._norm_diff = {}

        # (x, y) -> spectrum index in the real dataset
        self._coords = {}
        for i, coords in enumerate(self._imzml.coordinates):
            self._coords[(coords[0], coords[1])] = i

        # bin edges: each axis m/z shifted up by its ppm tolerance
        self._mz_bins = [mz * (1.0 + 1e-6 * ppm) for mz, ppm in self._mz_axis]
        # self._removeAssignedBins(layers_fn)

    def _removeAssignedBins(self, layers_fn):
        """Zero every entry of ``H[i]`` that is not an assigned m/z bin of layer ``i``."""
        # buggy at the moment
        # NOTE(review): unpickling executes arbitrary code -- only load layer files
        # from a trusted source.
        with open(layers_fn, 'rb') as f:
            layers = cPickle.load(f)
        for i in layers['layers_list']:
            assigned = layers['layers_list'][i]['assigned_mz_bins']
            assigned = assigned[assigned < self._H[i].shape[0]]
            print("#assigned bins in component #{}: {}".format(i + 1, len(assigned)))
            h = np.zeros_like(self._H[i])
            h[assigned] = self._H[i][assigned]
            self._H[i] = h

    def _getRealSpectrum(self, x, y):
        """Return the real spectrum stored at pixel ``(x, y)``."""
        return self._imzml.getspectrum(self._coords[(x, y)])

    def _norm(self, intensities):
        """Return ``np.linalg.norm`` of ``intensities``."""
        return np.linalg.norm(intensities)

    def generateNoise(self, x, y):
        """Return ``(mzs, intensities)`` of the noise peaks for pixel ``(x, y)``.

        Also records the norm of the binned real spectrum in ``self._norm_real``.
        The noise is scaled by the module-level ``args.inflate_noise``; peaks not
        above half of the smallest in-range real intensity are dropped.
        """
        real_mzs, real_intensities = [np.array(a) for a in self._getRealSpectrum(x, y)]

        min_mz, max_mz = self._mz_bins[0], self._mz_bins[-1]
        inside_range = (real_mzs >= min_mz) & (real_mzs <= max_mz)
        real_mzs = real_mzs[inside_range]
        real_intensities = real_intensities[inside_range]

        bins = np.digitize(real_mzs, self._mz_bins)
        n_bins = len(self._mz_bins)
        binned_real_intensities = np.bincount(bins, real_intensities, n_bins)
        self._norm_real[(x, y)] = self._norm(binned_real_intensities)
        binned_approx_intensities = self._W[x, y, :].dot(self._H)
        noise = np.abs(binned_real_intensities - binned_approx_intensities)

        # FIXME: avoid duplicating noise
        noise_intensities = noise[bins] * args.inflate_noise
        noise_mzs = np.array(real_mzs)
        nnz = noise_intensities > min(real_intensities) / 2
        return noise_mzs[nnz], noise_intensities[nnz]

    def addNoise(self, profile_spectrum, coords):
        """Centroid a simulated profile spectrum and merge in the pixel's noise.

        :param profile_spectrum: ``(mzs, intensities)`` of the simulated spectrum.
        :param coords: pixel coordinates; only the first two entries (x, y) are
            used, so ``(x, y)`` and ``(x, y, z)`` tuples are both accepted.
        :return: ``(mzs, intensities)`` sorted by m/z, restricted to peaks above
            the smallest intensity of the real spectrum at that pixel.
        """
        # Materialise both arrays; a lazy map object cannot be indexed.
        profile_mzs, profile_intensities = [np.array(a) for a in profile_spectrum]
        p = centroidize(profile_mzs, profile_intensities)
        mzs = np.array(p.masses)
        mult = profile_intensities.max() if len(profile_intensities) > 0 else 1
        intensities = np.array(p.abundances) * mult

        # The helpers take exactly (x, y); forwarding *coords breaks for 3-tuples.
        x, y = coords[:2]
        limit = min(self._getRealSpectrum(x, y)[1])
        noise_mzs, noise_intensities = self.generateNoise(x, y)

        self._norm_noise[(x, y)] = self._norm(noise_intensities[noise_intensities > limit])
        self._norm_groundtruth[(x, y)] = self._norm(intensities[intensities > limit])
        self._norm_simulated[(x, y)] = self._norm_noise[(x, y)] + self._norm_groundtruth[(x, y)]
        self._norm_diff[(x, y)] = abs(self._norm_simulated[(x, y)] - self._norm_real[(x, y)])

        mzs = np.concatenate([mzs, noise_mzs])
        intensities = np.concatenate([intensities, noise_intensities])
        detectable = np.where(intensities > limit)[0]
        mzs = mzs[detectable]
        intensities = intensities[detectable]
        order = mzs.argsort()
        return mzs[order], intensities[order]

    def saveStatistics(self, filename):
        """Write the collected per-pixel norms to ``filename`` in ``np.savez`` format.

        Each statistic is stored as a 2-D image indexed ``[x, y]`` under the keys
        ``real``, ``simulated``, ``groundtruth``, ``noise`` and ``diff``.
        """
        def toRect(d):
            # No pixels recorded: store an empty image instead of failing on max([]).
            if not d:
                return np.zeros((0, 0))
            xs = [k[0] for k in d]
            ys = [k[1] for k in d]
            img = np.zeros((max(xs) + 1, max(ys) + 1))
            for k in d:
                img[k[0], k[1]] = d[k]
            return img

        # savez writes a zip archive, so the handle must be binary. An open file is
        # passed (rather than the name) so that no ".npz" suffix is appended.
        with open(filename, "wb") as f:
            np.savez(f,
                     real=toRect(self._norm_real),
                     simulated=toRect(self._norm_simulated),
                     groundtruth=toRect(self._norm_groundtruth),
                     noise=toRect(self._norm_noise),
                     diff=toRect(self._norm_diff))
order = mzs.argsort() return mzs[order], intensities[order] def saveStatistics(self, filename): def toRect(d): xs = [k[0] for k in d] ys = [k[1] for k in d] img = np.zeros((max(xs) + 1, max(ys) + 1)) for k in d: img[k[0], k[1]] = d[k] return img with open(filename, "w+") as f: np.savez(f, real=toRect(self._norm_real), simulated=toRect(self._norm_simulated), groundtruth=toRect(self._norm_groundtruth), noise=toRect(self._norm_noise), diff=toRect(self._norm_diff)) ng = NoiseGenerator(args.nmf, args.layers, args.real) imzml_sim = ImzMLParser(args.simclean) with ImzMLWriter(args.output, mz_dtype=np.float32) as w: for i, coords in enumerate(imzml_sim.coordinates): noisy_mzs, noisy_intensities = ng.addNoise(imzml_sim.getspectrum(i), coords) w.addSpectrum(noisy_mzs, noisy_intensities, coords) ng.saveStatistics(args.output + ".norms")