def load_imzml_data_set(file):
    """
    Load every spectrum of an imzML file into two pandas data frames.

    The file is looked up inside ``data_path_imzml``. Nothing is written to
    disk; all results are handed back in a dictionary.

    :param file: name of the imzML file, relative to ``data_path_imzml``
    :return: dict with the keys ``"mass"`` (data frame of m/z arrays, one
        column per spectrum index), ``"intensity"`` (data frame of intensity
        arrays, one column per spectrum index), ``"x"`` and ``"y"`` (first
        two components of the last entry of the parser's coordinate list)
        and ``"f_name"`` (the part of ``file`` before its first ``'.'``)
    """
    parser = ImzMLParser(os.path.join(data_path_imzml, file))
    last_coordinate = parser.coordinates[-1]
    x_cord, y_cord = last_coordinate[0], last_coordinate[1]

    masses_by_index = {}
    intensities_by_index = {}
    for spectrum_index, (_x, _y, _z) in enumerate(parser.coordinates):
        # getspectrum returns the m/z values first, the matching ion
        # abundances second
        spectrum_mzs, spectrum_intensities = parser.getspectrum(spectrum_index)
        masses_by_index[spectrum_index] = spectrum_mzs
        intensities_by_index[spectrum_index] = spectrum_intensities

    return {
        "mass": pd.DataFrame(masses_by_index),
        "intensity": pd.DataFrame(intensities_by_index),
        "x": x_cord,
        "y": y_cord,
        "f_name": file.split('.')[0],
    }
Ejemplo n.º 2
0
def imzml_to_sbd(filepath_imzml, filepath_sbd):
    """Convert a pair of .imzml and .ibd files to a single .sbd file.

    The output starts with a 10-byte header (``<BQB``: 0, spectrum count, 8),
    followed by one 20-byte ``<QLfHH`` record per spectrum (data offset,
    number of points, summed intensity, x, y) and finally the spectra
    themselves, emitted through ``write_spectrum``.

    Args:
      filepath_imzml: path handed to ``ImzMLParser``
      filepath_sbd: path of the binary file to create

    Returns:
      bool: True once everything has been written
    """
    with open(filepath_sbd, 'wb') as sbd_stream:
        parser = ImzMLParser(filepath_imzml)
        spectrum_count = len(parser.coordinates)

        # Pass 1: collect the per-spectrum records. Offsets start right after
        # the header (10 bytes) and the record table (20 bytes per spectrum)
        # and advance by 12 bytes for every data point.
        data_offset = 20 * spectrum_count + 10
        records = []
        for index, (x, y, _z) in enumerate(parser.coordinates):
            mzs, intensities = parser.getspectrum(index)
            point_count = len(mzs)
            records.append(
                (data_offset, point_count, np.sum(intensities), x, y))
            data_offset += point_count * 12

        # Header first, then the record table
        sbd_stream.write(struct.pack('<BQB', 0, spectrum_count, 8))
        for record in records:
            sbd_stream.write(struct.pack('<QLfHH', *record))

        # Pass 2: re-read each spectrum and append its data
        for index in range(spectrum_count):
            mzs, intensities = parser.getspectrum(index)
            write_spectrum(sbd_stream, (mzs, intensities))

    return True
Ejemplo n.º 3
0
    def __read_all(self, filename):
        """
        Internal helper function used to read all data into memory.

        Allocates ``self.data`` (shape ``self.shape``, dtype
        ``self.data_type``) and stores the intensities of every spectrum at
        its pixel position, recentred by ``self.x_pos_min`` and
        ``self.y_pos_min``. For 'processed' files each spectrum is linearly
        interpolated onto the common ``self.mz`` axis; values outside the
        spectrum's own m/z range become 0.

        :param filename: path of the imzML file handed to ``ImzMLParser``
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__,'READING ALL DATA!! GIVE ME RAM (please)!')

        # Only 'processed' files need to be resampled onto the common axis
        reinterpolate = \
            self.imzml_type == self.available_imzml_types['processed']
        for ind, coord in enumerate(reader.coordinates):
            # Coordinate entries may be (x, y) or (x, y, z) depending on the
            # pyimzml version; only x and y are needed. Coordinates may start
            # at arbitrary locations, hence we subtract the minimum to
            # recenter at (0,0).
            xidx = coord[0] - self.x_pos_min
            yidx = coord[1] - self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Reinterpolate intensities if we are in processed mode
            if reinterpolate:
                f = interpolate.interp1d(mz, intens, fill_value=0,
                                         bounds_error=False)
                intens = f(self.mz)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
def write_corrected_msi(msi, output_file, tolerance, database_exactmass, step,
                        dalim):
    """Write a new imzML file containing m/z-corrected spectra.

    Every pixel of ``msi`` is processed on its own. A pixel is written to
    ``output_file`` only if all of the following hold, checked in this order:
    more than 30 selected peaks, more than 10 hit errors, a region of
    interest whose sum exceeds 10, and a truthy error model. Pixels failing
    any check are left out of the output.

    :param msi: path of the input imzML file
    :param output_file: path handed to ``ImzMLWriter``
    :param tolerance: forwarded to ``hits_generation``, ``hits_selection``
        and ``create_lm``
    :param database_exactmass: reference masses forwarded to
        ``hits_generation``
    :param step: forwarded to ``hits_selection`` and ``create_lm``
    :param dalim: forwarded as ``da_limit``
    """
    with ImzMLWriter(output_file) as writer:
        parser = ImzMLParser(msi, parse_lib='ElementTree')
        for pixel_idx, (x, y, z) in enumerate(parser.coordinates):
            pixel_mzs, pixel_intensities = parser.getspectrum(pixel_idx)
            peak_mzs = pixel_mzs[peak_selection(pixel_intensities)]
            if not len(peak_mzs) > 30:
                continue

            hit_exp, hit_errors = hits_generation(peak_mzs,
                                                  database_exactmass,
                                                  tolerance)
            if not len(hit_errors) > 10:
                continue

            roi = hits_selection(hit_errors, step, tolerance, da_limit=dalim)
            if not np.sum(roi) > 10:
                continue

            mz_error_model = create_lm(hit_exp,
                                       hit_errors,
                                       tolerance=tolerance,
                                       da_limit=dalim,
                                       step=step)
            if not mz_error_model:
                continue

            writer.addSpectrum(correct_mz_lm(pixel_mzs, mz_error_model),
                               pixel_intensities, (x, y, z))
Ejemplo n.º 5
0
    def spectrum_iter(self):
        """
        Generator function that yields a position and associated spectrum for a selected datacube type.

        The file ``self.basename`` is opened with ``ImzMLParser``. In
        'processed' mode the intensities are linearly interpolated onto
        ``self.mz`` (0 outside the spectrum's own m/z range) before being
        yielded.

        :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
        :yield: yi,          a numpy 1D-array of floats containing spectral intensities at the given position
                                and for the selected datacube type
        """
        reader = ImzMLParser(self.basename)
        # enumerate() replaces the Python-2-only xrange() index loop
        for idx, coord in enumerate(reader.coordinates):
            # Only x and y are used; a z component, if present, is ignored.
            # Coordinates may start at arbitrary locations, hence we subtract
            # the minimum to recenter at (0,0).
            xidx = coord[0] - self.x_pos_min
            yidx = coord[1] - self.y_pos_min
            mz, intens = reader.getspectrum(idx)
            # Resample the data if we are in processed mode
            if self.imzml_type == self.available_imzml_types['processed']:
                f = interpolate.interp1d(mz,
                                         intens,
                                         fill_value=0,
                                         bounds_error=False)
                intens = f(self.mz)

            yield (xidx, yidx), np.asarray(intens)
Ejemplo n.º 6
0
    def run(self):
        """
        Compute per-spectrum statistics and dump them as JSON.

        For every spectrum of ``self.imzml_filename`` the number of peaks and
        the minimum, maximum, peak-to-peak range and the 5/25/50/75/95th
        percentiles of the intensities are collected. The result is written
        to ``self.output().path`` as a dict of equally long lists.
        """
        from pyimzml.ImzMLParser import ImzMLParser
        import json
        pcts = [5, 25, 50, 75, 95]
        n_peaks, s_min, s_max, s_ptp, s_pcts = [], [], [], [], []
        p = ImzMLParser(self.imzml_filename)
        for i, _coord in enumerate(p.coordinates):
            mzs, ints = p.getspectrum(i)
            n_peaks.append(len(mzs))
            # float()/tolist() produce plain Python numbers; json cannot
            # serialize numpy scalars such as float32
            s_min.append(float(np.min(ints)))
            s_max.append(float(np.max(ints)))
            s_ptp.append(float(np.ptp(ints)))
            s_pcts.append(np.percentile(ints, pcts).tolist())

        stats = {
            'n_peaks': n_peaks,
            's_min': s_min,
            's_max': s_max,
            's_ptp': s_ptp,
            's_pcts': s_pcts
        }
        with open(self.output().path, 'w+') as f:
            json.dump(stats, f)
        # Function-call form works on both Python 2 and Python 3
        print('wrote spec stats')
def get_ds_spots(ds_id):
    """
    Merge the spectra belonging to each named spot of a dataset.

    Reads ``raw_datasets/{ds_id}.imzML``, the label image
    ``spotting/grids/{ds_id}.npy`` and the list of label names
    ``spotting/grids/{ds_id}_mask_names.json``. Label value ``i`` in the grid
    corresponds to ``mask_names[i]``; the label named ``'background'`` is
    skipped.

    :param ds_id: dataset identifier used to build the three file paths
    :return: dict mapping mask name to ``(mzs, ints, n_spectra)`` where the
        first two come from ``merge_spectra`` applied to the spot's spectra,
        each scaled so that its intensities sum to 1e6
    """
    parser = ImzMLParser(f'raw_datasets/{ds_id}.imzML')
    grid_mask = np.load(f'spotting/grids/{ds_id}.npy')
    # Context manager so the JSON file handle is closed deterministically
    with open(f'spotting/grids/{ds_id}_mask_names.json') as mask_names_file:
        mask_names = json.load(mask_names_file)

    # Make a mapping of coordinate -> spectrum index (-1 = no spectrum)
    coords = np.array(parser.coordinates)[:, :2]
    base_coord = np.min(coords, axis=0)
    coord_to_idx = np.ones(np.max(coords, axis=0) - base_coord + 1,
                           dtype='i') * -1
    for i, (x, y) in enumerate(coords):
        coord_to_idx[x - base_coord[0], y - base_coord[1]] = i

    # Collect spectra for each mask item
    spots = {}
    for i, mask_name in enumerate(mask_names):
        if mask_name != 'background':
            # The grid is indexed (row, column), i.e. (y, x)
            spectra_ys, spectra_xs = np.nonzero(grid_mask == i)
            # NOTE(review): a mask pixel without a spectrum yields index -1
            # here and is passed to getspectrum unchanged - confirm that
            # masks never cover empty pixels.
            spectra = [
                parser.getspectrum(idx)
                for idx in coord_to_idx[spectra_xs, spectra_ys]
            ]
            norm_spectra = [(mzs, ints * 1e6 / np.sum(ints))
                            for mzs, ints in spectra]
            mzs, ints = merge_spectra(norm_spectra)
            spots[mask_name] = mzs, ints, len(norm_spectra)
    return spots
Ejemplo n.º 8
0
    def __read_all(self, filename):
        """
        Internal helper function used to read all data into memory.

        Allocates ``self.data`` (shape ``self.shape``, dtype
        ``self.data_type``) and stores the intensities of every spectrum at
        its pixel position, recentred by ``self.x_pos_min`` and
        ``self.y_pos_min``. For 'processed' files each spectrum is re-binned
        onto the common ``self.mz`` axis by summing the intensities that fall
        into each bin.

        :param filename: path of the imzML file handed to ``ImzMLParser``
        """

        self.data = np.zeros(shape=self.shape, dtype=self.data_type)
        log_helper.info(__name__, 'Datacube shape is %s' % [self.data.shape])
        reader = ImzMLParser(filename)
        log_helper.debug(__name__, 'READING ALL DATA!! GIVE ME RAM (please)!')

        # Compute the bin edges for re-binning if needed: the common m/z axis
        # plus one closing edge one mean spacing beyond the last value
        if self.imzml_type == self.available_imzml_types['processed']:
            shift = np.diff(self.mz).mean()
            bin_edges = np.append(self.mz, self.mz[-1] + shift)
        else:
            bin_edges = None
        # enumerate() replaces the Python-2-only xrange() index loop
        for ind, coord in enumerate(reader.coordinates):
            # Coordinate entries may be (x, y) or (x, y, z) depending on the
            # pyimzml version; only x and y are needed. Coordinates may start
            # at arbitrary locations, hence we subtract the minimum to
            # recenter at (0,0).
            xidx = coord[0] - self.x_pos_min
            yidx = coord[1] - self.y_pos_min
            # Read the spectrum
            mz, intens = reader.getspectrum(ind)
            # Re-bin intensities if we are in processed mode
            if bin_edges is not None:
                intens, _ = np.histogram(mz, bins=bin_edges, weights=intens)
            # Save the intensity values in our data cube
            self.data[xidx, yidx, :] = intens
Ejemplo n.º 9
0
 def test_parser_get_spectrum(data_path, parse_lib):
     """Every pixel must yield equally long, non-empty m/z and intensity arrays."""
     reader = ImzMLParser(data_path, parse_lib=parse_lib)
     for pixel in range(reader.n_pixels):
         mzs, intensities = reader.get_spectrum(pixel)
         point_count = len(mzs)
         assert point_count == len(intensities)
         assert point_count > 0
         assert len(intensities) > 0
Ejemplo n.º 10
0
 def __init__(self, imzml_path: pathlib.Path):
     """
     Parse ``imzml_path`` and keep only the parser's portable spectrum reader.

     :param imzml_path: location of the imzML file
     :raises ImzMLError: wraps any exception raised while parsing; the
         message is the formatted traceback
     """
     try:
         # The parser object itself is not retained, only its portable reader
         self.spectrum_reader = ImzMLParser(
             imzml_path, parse_lib="ElementTree").portable_spectrum_reader()
     except Exception as err:
         raise ImzMLError(format_exc()) from err
     self._stream = None
Ejemplo n.º 11
0
    def __init__(self,
                 filename,
                 startX=1,
                 startY=1,
                 width=None,
                 height=None,
                 cropToData=False):
        """
        Open an imzML file and build a pixel -> spectrum index lookup image.

        ``self.indexImage`` has shape ``(height, width)`` and holds, for each
        pixel inside the selected window, the index of its spectrum in the
        file, or -1 where no spectrum exists. ``self.coordinates`` lists
        ``(index, x, y)`` for every spectrum inside the window.

        :param filename: path of the imzML file
        :param startX: first column of the window (ignored if ``cropToData``)
        :param startY: first row of the window (ignored if ``cropToData``)
        :param width: window width; defaults to reaching the largest x
        :param height: window height; defaults to reaching the largest y
        :param cropToData: if True, start the window at the smallest x and y
            present in the file and report coordinates relative to that
            corner (1-based)
        """
        self.imzML = ImzMLParser(filename)

        # Find the min and max row and column where data is present
        maxWidth = 0
        maxHeight = 0

        # -1 marks "no minimum seen yet"
        minWidth = -1
        minHeight = -1

        for (x, y, z) in self.imzML.coordinates:
            if x > maxWidth:
                maxWidth = x
            if y > maxHeight:
                maxHeight = y
            if minWidth == -1 or minWidth > x:
                minWidth = x
            if minHeight == -1 or minHeight > y:
                minHeight = y

        if cropToData:
            startX = minWidth
            startY = minHeight

        if width is None:
            width = maxWidth - startX + 1
        if height is None:
            height = maxHeight - startY + 1

        self.startX = startX
        self.startY = startY
        self.width = width
        self.height = height
        self.coordinates = []
        self.cropToData = cropToData

        # The ``np.int`` alias was removed from NumPy; the builtin ``int`` is
        # what it used to stand for.
        self.indexImage = np.ones((height, width), dtype=int) * -1

        for index, (x, y, z) in enumerate(self.imzML.coordinates):
            inside_window = (startX <= x < startX + width
                             and startY <= y < startY + height)
            if not inside_window:
                continue
            if cropToData:
                self.coordinates.append(
                    (index, x - minWidth + 1, y - minHeight + 1))
                self.indexImage[y - minHeight, x - minWidth] = index
            else:
                self.coordinates.append((index, x, y))
                self.indexImage[y - startY, x - startX] = index
Ejemplo n.º 12
0
class IMSDataset:
    """Wrapper around an imzML file offering table, array and zarr exports."""

    def __init__(self, fpath, micro_res=0.5, IMS_res=10):
        """
        :param fpath: path of the imzML file handed to ``ImzMLParser``
        :param micro_res: resolution of the microscopy image
        :param IMS_res: resolution of the IMS image, in the same unit as
            ``micro_res``
        """
        self.parser = ImzMLParser(fpath)
        self.micro_res = micro_res
        self.IMS_res = IMS_res
        # Edge length of one IMS pixel expressed in microscopy pixels
        self.IMS_px_in_micro = IMS_res / micro_res

    def __get_min_max_coords(self):
        """Return ``(x_min, y_min, x_max, y_max)`` over all coordinates."""
        coords = np.array(self.parser.coordinates)
        x_min, y_min, _ = np.min(coords, axis=0)
        x_max, y_max, _ = np.max(coords, axis=0)
        return x_min, y_min, x_max, y_max

    def to_columnar(self, mz_precision=4, dtype="uint32"):
        """
        Build one data frame with a row per pixel.

        The first columns describe the pixel (``x``, ``y``, the top-left
        corner and width in microscopy pixels); the remaining columns hold
        the intensities, one column per m/z value of the first spectrum,
        labelled with the m/z rounded to ``mz_precision`` decimals.

        :param mz_precision: number of decimals kept in the m/z column names
        :param dtype: dtype applied to both the coordinate and the intensity
            columns
        :return: ``pandas.DataFrame`` with ``len(coordinates)`` rows
        """
        mzs, _ = self.parser.getspectrum(0)
        # Was ``dataset.parser`` - an undefined global; the instance's own
        # parser is what is meant.
        coords = np.array(self.parser.coordinates)
        x, y, _ = coords.T

        coords_df = pd.DataFrame(
            {
                "x": x,
                "y": y,
                "micro_x_topleft": x * self.IMS_px_in_micro - self.IMS_px_in_micro,
                "micro_y_topleft": y * self.IMS_px_in_micro - self.IMS_px_in_micro,
                "micro_px_width": np.repeat(self.IMS_px_in_micro, len(coords)),
            },
            dtype=dtype,
        )

        # Every spectrum must have as many points as the first one, otherwise
        # the row assignment below fails.
        intensities = np.zeros((len(coords_df), len(mzs)))
        for i in range(len(coords)):
            _, coord_intensities = self.parser.getspectrum(i)
            intensities[i, :] = coord_intensities

        intensities = pd.DataFrame(
            intensities, columns=np.round(mzs, mz_precision).astype(str), dtype=dtype
        )

        return coords_df.join(intensities)

    def to_array(self):
        """
        Return the dataset as a dense ``(x, y, m/z)`` array.

        Pixels without a spectrum stay 0.

        :raises ValueError: if the spectra do not all have the same length
        """
        x_min, y_min, x_max, y_max = self.__get_min_max_coords()
        mz_lengths = self.parser.mzLengths
        if not (mz_lengths.count(mz_lengths[0]) == len(mz_lengths)):
            raise ValueError("The number of m/z is not the same at each coordinate.")

        arr = np.zeros((x_max - x_min + 1, y_max - y_min + 1, mz_lengths[0]))

        for idx, (x, y, _) in enumerate(self.parser.coordinates):
            _, intensities = self.parser.getspectrum(idx)
            arr[x - x_min, y - y_min, :] = intensities

        return arr

    def write_zarr(self, path, dtype="i4"):
        """
        Store the dense array from ``to_array`` in an uncompressed zarr array.

        :param path: target location passed to ``zarr.open``
        :param dtype: dtype of the stored array
        """
        arr = self.to_array()
        z_arr = zarr.open(path, mode="w", shape=arr.shape, compressor=None, dtype=dtype)
        z_arr[:, :, :] = arr
Ejemplo n.º 13
0
def get_spec(x, y1, y2, imzML_file):
    """
    Collect the spectra of one image column segment.

    For every ``y`` in ``range(y1, y2)`` the spectrum at coordinate
    ``(x, y, 1)`` is looked up. Coordinates missing from the file are
    reported on stdout and skipped.

    :param x: x coordinate of the column
    :param y1: first y coordinate (inclusive)
    :param y2: last y coordinate (exclusive)
    :param imzML_file: path of the imzML file
    :return: dict mapping spectrum index to an array of the values of
        ``tupel2map(spectrum)``
    """
    parser = ImzMLParser(imzML_file)
    part_map = dict()
    for y in range(y1, y2):
        # Only the lookup is guarded: a missing coordinate is expected, any
        # other failure (I/O, conversion) should surface instead of being
        # reported as "not in list".
        try:
            idx = parser.coordinates.index((x, y, 1))
        except ValueError:
            print(f"({x}, {y}, 1) is not in list.")
            continue
        spec_map = tupel2map(parser.getspectrum(idx))
        part_map[idx] = np.array(list(spec_map.values()))
    return part_map
Ejemplo n.º 14
0
    def __init__(self, path: Path):
        """
        Locate the imzML file for ``path`` and parse it.

        :param path: location passed to ``find_file_by_ext`` together with
            the extension ``'imzml'``
        :raises ImzMLError: wraps any exception raised by the parser; the
            message is the formatted traceback
        """
        self.filename = find_file_by_ext(path, 'imzml')
        try:
            parser = ImzMLParser(self.filename,
                                 parse_lib='ElementTree',
                                 include_spectra_metadata=METADATA_FIELDS)
            self._imzml_parser = parser
        except Exception as err:
            raise ImzMLError(format_exc()) from err

        super().__init__(self._imzml_parser)
Ejemplo n.º 15
0
    def __init__(self, filename):
        """ Initialize Filtering Framework from an imzml file.

        All spectra are read eagerly: ``self.mzlist`` and
        ``self.intensity_list`` receive one entry per spectrum, while
        ``self.mzs`` / ``self.intensities`` end up holding the last spectrum
        read. The two ``filter_spec_*`` arrays are created from the still
        empty lists, before any spectrum is loaded.
        """
        self.spectrum = ImzMLParser(filename)
        self.mzlist, self.intensity_list = [], []
        self.filename = []
        self.filter_spec_mass = np.zeros(np.shape(self.mzlist))
        self.filter_spec_intens = np.zeros(np.shape(self.intensity_list))

        for spectrum_index, (_x, _y, _z) in enumerate(
                self.spectrum.coordinates):
            spectrum_pair = self.spectrum.getspectrum(spectrum_index)
            self.mzs, self.intensities = spectrum_pair
            self.mzlist.append(self.mzs)
            self.intensity_list.append(self.intensities)
Ejemplo n.º 16
0
    def __init__(self, storage: Storage, imzml_cobject: CloudObject,
                 ibd_cobject: CloudObject):
        """
        Parse an imzML document streamed from cloud storage.

        Only the imzML part is read here (``ibd_file=None``); the reference
        to the .ibd object is stored in ``self._ibd_cobject``.

        :param storage: storage client used to fetch ``imzml_cobject``
        :param imzml_cobject: cloud object holding the imzML document
        :param ibd_cobject: cloud object holding the binary data
        """
        imzml_stream = storage.get_cloudobject(imzml_cobject, stream=True)
        parser = ImzMLParser(imzml_stream,
                             ibd_file=None,
                             parse_lib='ElementTree',
                             include_spectra_metadata=METADATA_FIELDS)

        self._ibd_cobject = ibd_cobject
        self.imzml_reader = parser.portable_spectrum_reader()

        super().__init__(parser)
Ejemplo n.º 17
0
    def test_parser_iter(data_path, parse_lib):
        """Iterating the parser must match ``get_spectrum`` pixel by pixel."""
        parser = ImzMLParser(data_path, parse_lib=parse_lib)

        seen = 0
        for px, (iter_mzs, iter_ints) in enumerate(parser):
            direct_mzs, direct_ints = parser.get_spectrum(px)
            assert len(iter_mzs) == len(iter_ints)
            assert len(iter_mzs) == len(direct_mzs)
            assert len(iter_ints) == len(direct_ints)
            assert_equal(direct_mzs, iter_mzs)
            assert_equal(direct_ints, iter_ints)
            seen += 1

        # The iterator has to visit every pixel exactly once
        assert seen == parser.n_pixels
Ejemplo n.º 18
0
def import_imzml_dataset(filepath):
    """Read an .imzml file and wrap each of its spectra in a ``spectrum``.

    Returns:
    list: one ``spectrum(mzs, intensities, x, y, z)`` per coordinate, in
    file order
    """
    parser = ImzMLParser(filepath)
    return [
        spectrum(*parser.getspectrum(index), x, y, z)
        for index, (x, y, z) in enumerate(parser.coordinates)
    ]
def save_data_to_csv(filename):
    """
    Dump all spectra of an imzML file into two CSV files.

    The file is looked up inside ``data_path``. ``mass_data.csv`` receives
    the m/z arrays and ``intensities.csv`` the intensity arrays, one column
    per spectrum index; both are written to the current working directory.

    :param filename: name of the imzML file, relative to ``data_path``
    """
    parser = ImzMLParser(os.path.join(data_path, filename))
    masses_by_index = {}
    intensities_by_index = {}
    for spectrum_index, (_x, _y, _z) in enumerate(parser.coordinates):
        # getspectrum returns the m/z values first, the matching ion
        # abundances second
        spectrum_mzs, spectrum_intensities = parser.getspectrum(spectrum_index)
        masses_by_index[spectrum_index] = spectrum_mzs
        intensities_by_index[spectrum_index] = spectrum_intensities
    mass_frame = pd.DataFrame(masses_by_index)
    intensity_frame = pd.DataFrame(intensities_by_index)
    mass_frame.to_csv('mass_data.csv')
    intensity_frame.to_csv('intensities.csv')
Ejemplo n.º 20
0
class ImzmlDataset(BaseDataset):
    """Dataset backed by an imzML file that is read through pyimzml."""

    def __init__(self, filename):
        """Open ``filename`` and cache its coordinate list as an array."""
        from pyimzml.ImzMLParser import ImzMLParser
        super(ImzmlDataset, self).__init__(filename)
        parser = ImzMLParser(filename)
        self.imzml = parser
        self.coordinates = np.asarray(parser.coordinates)
        # FIXME: placeholder - the pixel size should come from the header data
        self.step_size = [1, 1, 1]

    def get_spectrum(self, ix):
        """Return ``[mzs, counts]`` of spectrum ``ix`` as numpy arrays."""
        # TODO: return a MassSpectrum instead of a plain list
        mzs, counts = self.imzml.getspectrum(ix)
        mz_array = np.asarray(mzs)
        count_array = np.asarray(counts)
        return [mz_array, count_array]

    def get_image(self, mz, tol):
        """Return the ion image for ``mz`` with tolerance ``tol``."""
        return self.imzml.getionimage(mz, tol)
Ejemplo n.º 21
0
 def on_pushButton_clicked(self):
     """
     Let the user pick an imzML file and start the averaging worker.

     Opens a file dialog rooted at the current working directory. If a file
     is chosen, its path is shown in ``lineEdit_1``, a progress form with a
     'Cancel' button is displayed, the file is parsed and an
     ``Average_mz_cal`` worker is started. Any exception is reported through
     ``self.error``.
     """
     try:
         chosen_file, _ = QFileDialog.getOpenFileName(
             self, u'Choose Imzml file', os.getcwd(),
             'Imzml files (*.imzml)')
         if not chosen_file:
             # Dialog was cancelled: nothing to do
             return

         self.lineEdit_1.setText(chosen_file)

         self.progressBar = progress_form = My_Progress_Form()
         progress_form.progressBar.setValue(0)
         cancel_button = progress_form.pushButton
         cancel_button.setVisible(True)
         cancel_button.setText('Cancel')
         cancel_button.clicked.connect(self.thread_terminate)
         progress_form.show()

         self.p = ImzMLParser(self.lineEdit_1.text())
         self.mbt = worker = Average_mz_cal(self.p)
         worker.trigger.connect(self.progress_update)
         worker.trigger2.connect(self.avg_mz_plot)
         worker.start()
     except Exception as err:
         self.error('Running error, info: ' + str(err))
Ejemplo n.º 22
0
    def __init__(self, fname, specStart=0):
        """
        Open an imzML file and determine its regions.

        The m/z axis is taken from the first spectrum. A non-zero
        ``specStart`` drops that many leading m/z values and prints a
        warning. ``find_regions`` is called at the end.

        :param fname: path of the imzML file
        :param specStart: number of leading m/z values to skip
        """
        self.fname = fname
        self.parser = ImzMLParser(fname)
        self.dregions = None

        first_mzs, _first_intensities = self.parser.getspectrum(0)[:2]
        self.mzValues = first_mzs

        self.specStart = specStart

        if self.specStart == 0:
            pass
        else:
            self.mzValues = self.mzValues[self.specStart:]
            print("WARNING: SPECTRA STARTING AT POSITION", self.specStart)

        self.find_regions()
Ejemplo n.º 23
0
class FSImzMLReader(ImzMLReader):
    """``ImzMLReader`` that reads its spectra from files on disk."""

    def __init__(self, path: Path):
        """
        Locate the imzML file for ``path`` and parse it.

        :raises ImzMLError: wraps any exception raised by the parser; the
            message is the formatted traceback
        """
        self.filename = find_file_by_ext(path, 'imzml')
        try:
            parser = ImzMLParser(self.filename,
                                 parse_lib='ElementTree',
                                 include_spectra_metadata=METADATA_FIELDS)
            self._imzml_parser = parser
        except Exception as err:
            raise ImzMLError(format_exc()) from err

        super().__init__(self._imzml_parser)

    def iter_spectra(self, sp_idxs: Sequence[int]):
        """
        Yield ``(sp_idx, mzs, ints)`` for each requested spectrum index.

        Each spectrum is checked against the lengths declared by the parser
        and then passed through ``self._process_spectrum``, whose result is
        what gets yielded.
        """
        for sp_idx in sp_idxs:
            parser = self._imzml_parser
            mzs, ints = parser.getspectrum(sp_idx)

            mz_count = len(mzs)
            assert mz_count == parser.mzLengths[sp_idx], 'Incomplete .ibd file'
            int_count = len(ints)
            assert int_count == parser.intensityLengths[sp_idx], \
                'Incomplete .ibd file'
            assert mz_count == int_count, \
                f"Spectrum {sp_idx} mz and intensity counts don't match"

            out_idx, out_mzs, out_ints = self._process_spectrum(
                sp_idx, mzs, ints)
            yield out_idx, out_mzs, out_ints
Ejemplo n.º 24
0
    def test_writer_image(get_temp_path, data_mode):
        """Write a 3x3 image and check that it reads back unchanged."""
        mz_axis = np.linspace(100, 1000, 20)
        # x varies slowest, y fastest; z is always 1
        pixel_coords = [[x, y, 1] for x in (1, 2, 3) for y in (1, 2, 3)]
        pixel_intensities = np.random.rand(len(pixel_coords),
                                           mz_axis.shape[0])

        output_filename = os.path.join(get_temp_path, "test.imzML")
        with ImzMLWriter(output_filename, mode=data_mode) as writer:
            for row, position in zip(pixel_intensities, pixel_coords):
                writer.add_spectrum(mz_axis, row, coords=position)

        with ImzMLParser(output_filename) as parser:
            for px, (read_mzs, read_ints) in enumerate(parser):
                assert_array_almost_equal(read_mzs, mz_axis, 4)
                assert_array_almost_equal(read_ints, pixel_intensities[px], 4)
                assert parser.n_pixels == len(pixel_coords)
    def __init__(self, imzml_filename, raw_data_filename, formulas_filename):
        """
        Load the centroided and raw data and precompute summary spectra.

        Steps, each announced through ``logging.info``: estimate the
        resolution from 25 raw spectra, read the m/z range of the imzML file,
        generate isotope patterns for the formulas, read the mean raw
        spectrum and build a summary spectrum from the centroided data.

        :param imzml_filename: path of the centroided imzML file
        :param raw_data_filename: path ending in ``.h5`` (opened with h5py)
            or ``.RAW`` (kept as a string)
        :param formulas_filename: path handed to ``generate_patterns``
        :raises ValueError: for any other raw data extension
        """
        self.imzml = ImzMLParser(imzml_filename)
        self.formulas_fn = formulas_filename

        if not raw_data_filename.endswith((".h5", ".RAW")):
            raise ValueError("only .h5 and .RAW are supported")
        if raw_data_filename.endswith(".h5"):
            self.raw = h5py.File(raw_data_filename)
        else:
            self.raw = str(raw_data_filename)

        sample_count = 25
        logging.info("estimating resolution from %d random raw spectra..." % sample_count)
        self.resolution_func = resolution_estimate(self.raw, sample_count)
        logging.info("resolution is %d @ 200" % round(self.resolution_func(200)))

        self.mz_range = self.imzml.get_mz_range()
        logging.info("m/z range: %f .. %f" % self.mz_range)

        logging.info("generating isotope patterns...")
        self.patterns = generate_patterns(self.formulas_fn, self.resolution_func, self.mz_range)

        logging.info("computing mean spectrum...")
        raw_mzs, self.mean_intensities = read_mean_spectrum(self.raw)

        logging.info("computing mean spectrum from centroided data...")
        summary = generate_summary_spectrum3(raw_mzs, self.imzml)
        self.mzs, self.intensities, self.frequencies = summary

        self.n = 5
Ejemplo n.º 26
0
def import_spectra(filepath, spectra_format="imzml"):
    """
    Read all spectra of a file into a list.

    :param filepath: path of the file to read
    :param spectra_format: ``"imzml"``/``"imzML"`` for imzML files. The
        Bruker names ``"brukerflex"``, ``"xmass"`` and ``"Xmass"`` are
        recognised but not implemented.
    :return: list with the result of ``getspectrum`` for every coordinate,
        in file order
    :raises NotImplementedError: for the recognised Bruker formats
    :raises ValueError: for any other format name
    """
    ############### IMZML
    if spectra_format in ("imzml", "imzML"):
        ##### Import the libraries
        install_required_packages("pyimzml")
        from pyimzml.ImzMLParser import ImzMLParser
        ##### Parse the imzML file
        parsed_imzml = ImzMLParser(filepath)
        ##### Generate the list of spectra
        # Only the index is needed, so the coordinate tuples are not unpacked
        # (they carry three components, not two).
        return [parsed_imzml.getspectrum(i)
                for i, _coord in enumerate(parsed_imzml.coordinates)]
    ############### XMASS
    if spectra_format in ("brukerflex", "xmass", "Xmass"):
        # Previously fell through to an unbound ``spectra`` variable
        raise NotImplementedError(
            "spectra format %r is not supported yet" % spectra_format)
    raise ValueError("unknown spectra format %r" % spectra_format)
Ejemplo n.º 27
0
 def collect_metadata(self):
     """
     Return the imzML metadata dictionary of ``self.path``.

     Values that are numpy integers (of any width) are converted to plain
     Python ``int``; all other values are returned unchanged.
     """
     print('parsing imzML from %s' % self.path)
     with ImzMLParser(self.path) as parser:
         md = parser.imzmldict
     # isinstance with np.integer also covers int32, uint16, ... which the
     # former exact ``type(v) == np.int64`` comparison let through
     return {
         k: (int(v) if isinstance(v, np.integer) else v)
         for k, v in md.items()
     }
def load_and_split_ds_vm(storage, imzml_cobject, ibd_cobject, ds_segm_size_mb,
                         sort_memory):
    """
    Download a dataset into a temporary directory, segment it and upload the
    segments.

    The work runs in five timed stages: download, parser loading, segment
    bound definition, segmentation and upload. The temporary directory is
    removed when the function returns.

    :param storage: storage client used for download and upload
    :param imzml_cobject: cloud object of the imzML document
    :param ibd_cobject: cloud object of the binary data
    :param ds_segm_size_mb: forwarded to ``define_ds_segments``
    :param sort_memory: forwarded to ``make_segments``
    :return: tuple ``(imzml_reader, ds_segments_bounds, ds_segms_cobjects,
        ds_segms_len, stats)`` where ``stats`` is a list of
        ``(stage_name, seconds)`` pairs
    """
    stats = []

    with TemporaryDirectory() as tmp_dir:
        logger.info("Temp dir is {}".format(tmp_dir))
        tmp_root = Path(tmp_dir)

        imzml_dir = tmp_root / 'imzml'
        mkdir_result = imzml_dir.mkdir()
        logger.info("Create {} result {}".format(imzml_dir, mkdir_result))

        segments_dir = tmp_root / 'segments'
        mkdir_result = segments_dir.mkdir()
        logger.info("Create {} result {}".format(segments_dir, mkdir_result))

        logger.info('Downloading dataset...')
        started = time()
        imzml_path, ibd_path = download_dataset(imzml_cobject, ibd_cobject,
                                                imzml_dir, storage)
        stats.append(('download_dataset', time() - started))

        logger.info('Loading parser...')
        started = time()
        imzml_parser = ImzMLParser(str(imzml_path))
        imzml_reader = imzml_parser.portable_spectrum_reader()
        stats.append(('load_parser', time() - started))

        logger.info('Defining segments bounds...')
        started = time()
        ds_segments_bounds = define_ds_segments(
            imzml_parser, ds_segm_size_mb=ds_segm_size_mb)
        segments_n = len(ds_segments_bounds)
        stats.append(('define_segments', time() - started))

        logger.info('Segmenting...')
        started = time()
        chunks_n, ds_segms_len = make_segments(imzml_reader, ibd_path,
                                               ds_segments_bounds,
                                               segments_dir, sort_memory)
        stats.append(('dataset_segmentation', time() - started))

        logger.info('Uploading segments...')
        started = time()
        ds_segms_cobjects = upload_segments(storage, segments_dir, chunks_n,
                                            segments_n)
        stats.append(('upload_segments', time() - started))

        return (imzml_reader, ds_segments_bounds, ds_segms_cobjects,
                ds_segms_len, stats)
Ejemplo n.º 29
0
    def test_parser_init_paths(data_path, parse_lib):
        """Open the example file by path and check its size and value ranges."""
        parser = ImzMLParser(data_path, parse_lib=parse_lib)
        assert len(parser.coordinates) == 9
        assert parser.n_pixels == 9

        # First pixel: arrays only need to be consistent and non-empty
        first_mzs, first_ints = parser.get_spectrum(0)
        assert len(first_mzs) == len(first_ints)
        assert len(first_mzs) > 0
        assert len(first_ints) > 0

        # Fifth pixel: exact length and value ranges are pinned
        mid_mzs, mid_ints = parser.get_spectrum(4)
        assert len(mid_mzs) == len(mid_ints)
        assert len(mid_mzs) == 8399
        assert len(mid_ints) == 8399
        assert np.all(mid_mzs > 100.0)
        assert np.all(mid_mzs < 800.0)
        assert np.all(mid_ints >= 0.0)
        assert np.all(mid_ints < 3.0)
Ejemplo n.º 30
0
    def test_parser_init_paths_as_with(data_path, parse_lib):
        """The parser must also work when used as a context manager."""
        with ImzMLParser(data_path, parse_lib=parse_lib) as reader:
            assert len(reader.coordinates) == 9
            assert reader.n_pixels == 9

            first_mzs, first_ints = reader.get_spectrum(0)
            assert len(first_mzs) == len(first_ints)
            assert len(first_mzs) > 0
            assert len(first_ints) > 0
Ejemplo n.º 31
0
 def run(self):
     """
     Build one summary image per entry of ``self.im_types`` and dump each
     as JSON.

     Every ``im_type`` names a numpy function (looked up with ``getattr``)
     that reduces a spectrum's intensities to a single pixel value. Image
     ``ii`` is written to ``self.output()[ii].path`` as a dict holding the
     flattened values (``im_vect``) and the image shape (``im_shape``).
     """
     from pyimzml.ImzMLParser import ImzMLParser
     import json
     parser = ImzMLParser(self.imzml_filename)
     images = {
         im_type: np.zeros((parser.imzmldict["max count of pixels y"],
                            parser.imzmldict["max count of pixels x"]))
         for im_type in self.im_types
     }
     for spectrum_idx, (x, y, _z) in enumerate(parser.coordinates):
         _mzs, ints = parser.getspectrum(spectrum_idx)
         for im_type in self.im_types:
             # Positions are shifted by one to index the array (presumably
             # the coordinates are 1-based - confirm)
             images[im_type][y - 1, x - 1] = getattr(np, im_type)(ints)
     for out_idx, im_type in enumerate(self.im_types):
         image = images[im_type]
         result = {
             'im_vect': list(image.flatten()),
             'im_shape': np.shape(image)
         }
         with open(self.output()[out_idx].path, 'w+') as f:
             json.dump(result, f)
Ejemplo n.º 32
0
def test_browse(data_path, parse_lib, item_ids):
    """Browsing every spectrum must turn up at least one of the item ids."""
    parser = ImzMLParser(data_path, parse_lib=parse_lib)
    browser = browse(parser)
    assert browser

    collected_ids = set()
    for pixel in range(parser.n_pixels):
        spectrum_view = browser.for_spectrum(pixel)
        collected_ids.update(spectrum_view.get_ids(item_ids))

    assert len(collected_ids) != 0
Ejemplo n.º 33
0
    def test_parser_init_ibd_as_filename(imzml_path, ibd_path, parse_lib):
        """The .ibd file may be given explicitly as a file name."""
        with ImzMLParser(imzml_path, ibd_file=ibd_path,
                         parse_lib=parse_lib) as reader:
            assert len(reader.coordinates) == 9
            assert reader.n_pixels == 9

            first_mzs, first_ints = reader.get_spectrum(0)
            assert len(first_mzs) == len(first_ints)
            assert len(first_mzs) > 0
            assert len(first_ints) > 0
Ejemplo n.º 34
0
    def spectrum_iter(self):
        """
        Generator function that yields a position and associated spectrum for a selected datacube type.

        The file ``self.basename`` is opened with ``ImzMLParser``. In
        'processed' mode the intensities are linearly interpolated onto
        ``self.mz`` (0 outside the spectrum's own m/z range) before being
        yielded.

        :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
        :yield: yi,          a numpy 1D-array of floats containing spectral intensities at the given position
                                and for the selected datacube type
        """
        reader = ImzMLParser(self.basename)
        # enumerate() replaces the Python-2-only xrange() index loop
        for idx, coord in enumerate(reader.coordinates):
            # Only x and y are used; a z component, if present, is ignored.
            # Coordinates may start at arbitrary locations, hence we subtract
            # the minimum to recenter at (0,0).
            xidx = coord[0] - self.x_pos_min
            yidx = coord[1] - self.y_pos_min
            mz, intens = reader.getspectrum(idx)
            # Resample the data if we are in processed mode
            if self.imzml_type == self.available_imzml_types['processed']:
                f = interpolate.interp1d(mz, intens, fill_value=0,
                                         bounds_error=False)
                intens = f(self.mz)

            yield (xidx, yidx), np.asarray(intens)
Ejemplo n.º 35
0
def main(argv):
    """Round-trip an imzML file through ImzMLWriter and report whether the first spectrum survived.

    Command line options (parsed with getopt):
      -h                 print the usage line and exit
      -i / --ifile FILE  input imzML file (required)
      -o / --ofile FILE  output file; defaults to ``<inputfile>.imzML``

    Every spectrum of the input is written to the output as float32, the
    output is read back, and ``True``/``False`` is printed depending on
    whether the first spectrum (m/z values, intensities, coordinates) of both
    files is identical.

    :param argv: list of command line arguments without the program name
    :raises IOError: if no input file was given
    :raises SystemExit: on ``-h`` (no exit code) or on an unknown option (code 2)
    """
    usage = 'test.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, _args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile == '':
        print(usage)
        raise IOError('input file not specified')
    if outputfile == '':
        outputfile = inputfile + '.imzML'

    # Imported only once the arguments are known to be valid, so that the
    # usage/help paths do not require pyimzml. The writer is imported here as
    # well because this function uses it directly.
    from pyimzml.ImzMLParser import ImzMLParser
    from pyimzml.ImzMLWriter import ImzMLWriter

    imzml = ImzMLParser(inputfile)
    spectra = []
    with ImzMLWriter(outputfile, mz_dtype=np.float32, intensity_dtype=np.float32) as writer:
        for i, coords in enumerate(imzml.coordinates):
            mzs, intensities = imzml.getspectrum(i)
            writer.addSpectrum(mzs, intensities, coords)
            spectra.append((mzs, intensities, coords))

    imzml = ImzMLParser(outputfile)
    spectra2 = []
    for i, coords in enumerate(imzml.coordinates):
        mzs, intensities = imzml.getspectrum(i)
        spectra2.append((mzs, intensities, coords))

    if not spectra or not spectra2:
        # Nothing to compare element-wise; the files agree only if both are empty.
        print(len(spectra) == len(spectra2))
        return

    # Compare field by field: "==" on tuples that contain numpy arrays raises
    # "truth value of an array is ambiguous".
    first, first_back = spectra[0], spectra2[0]
    same = bool(np.array_equal(first[0], first_back[0])
                and np.array_equal(first[1], first_back[1])
                and tuple(first[2]) == tuple(first_back[2]))
    print(same)
Ejemplo n.º 36
0
 def __init__(self, nmf_fn, layers_fn, imzml_fn):
     """Open the imzML file, load the NMF arrays and build lookup tables.

     :param nmf_fn: path of a numpy archive providing 'shape', 'W', 'H' and 'mz_axis'
     :param layers_fn: accepted but not used by this constructor
     :param imzml_fn: path handed to ImzMLParser
     """
     self._imzml = ImzMLParser(imzml_fn)
     with np.load(nmf_fn) as archive:
         dim_x, dim_y = archive['shape']
         # W is stored flat; give it one leading axis per image dimension
         self._W = archive['W'].reshape((dim_x, dim_y, -1))
         self._H = archive['H']
         self._mz_axis = archive['mz_axis']

     # per-category dictionaries, initially empty
     self._norm_real, self._norm_simulated = {}, {}
     self._norm_groundtruth, self._norm_noise = {}, {}
     self._norm_diff = {}

     # (x, y) -> index of the spectrum in the imzML file
     self._coords = {
         (position[0], position[1]): spectrum_index
         for spectrum_index, position in enumerate(self._imzml.coordinates)
     }
     # one value per (mz, ppm) pair: mz shifted up by ppm parts per million
     self._mz_bins = [mz * (1.0 + 1e-6 * ppm) for mz, ppm in self._mz_axis]
Ejemplo n.º 37
0
class inMemoryIMS():
    """Load an imaging mass spectrometry dataset (.imzML or .hdf5) into memory.

    When spectra are cached, the centroided peaks of all spectra are held in
    three parallel arrays sorted by m/z: ``mz_list``, ``count_list`` and
    ``idx_list`` (the spectrum index each peak came from). ``get_ion_image``
    and ``generate_summary_spectrum`` search these arrays. Per-spectrum
    summaries are kept in ``tic`` (sum of counts) and ``mic`` (maximum count).
    """

    def __init__(self, filename, min_mz=0., max_mz=np.inf, min_int=0., index_range=None, cache_spectra=True, do_summary=True, norm=''):
        """Check that ``filename`` exists and load it; see :meth:`load_file` for the parameters."""
        # Kept for its side effect only: raises OSError early if the file is missing.
        os.path.getsize(filename)
        self.load_file(filename, min_mz, max_mz, min_int, index_range=index_range, cache_spectra=cache_spectra, do_summary=do_summary, norm=norm)

    def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=None, cache_spectra=True, do_summary=True, norm=None):
        """Parse ``filename`` and optionally cache its peaks and summary values.

        :param filename: path to a ``.imzml`` or ``.hdf5`` file (extension is case-insensitive)
        :param min_mz: peaks with m/z <= min_mz are dropped
        :param max_mz: peaks with m/z >= max_mz are dropped
        :param min_int: peaks with counts <= min_int are dropped
        :param index_range: spectrum indices to load from an hdf5 file; empty/None means all.
                            Ignored for imzML files, where every spectrum is loaded.
        :param cache_spectra: keep all peaks in memory (required by get_ion_image and
                              generate_summary_spectrum)
        :param do_summary: compute ``tic`` and ``mic`` for every spectrum
        :param norm: 'TIC', 'RMS' or 'MAD' to normalise each spectrum; anything falsy disables it
        :raises TypeError: for an unsupported file extension or mismatched mz/count lengths
        """
        # parse file to get required parameters
        # can use thin hdf5 wrapper for getting data from file
        self.file_dir, self.filename = os.path.split(filename)
        self.filename, self.file_type = os.path.splitext(self.filename)
        self.file_type = self.file_type.lower()
        self.norm = [] if norm is None else norm
        if self.file_type == '.hdf5':
            import h5py
            self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
            if index_range is None or len(index_range) == 0:
                # a real list: the indices are iterated several times (a Python 3 map is one-shot)
                self.index_list = list(map(int, self.hdf['/spectral_data'].keys()))
            else:
                self.index_list = index_range
        elif self.file_type == '.imzml':
            from pyimzml.ImzMLParser import ImzMLParser
            self.imzml = ImzMLParser(filename)
            self.index_list = list(range(0, len(self.imzml.coordinates)))
        else:
            raise TypeError('File type not recogised: {}'.format(self.file_type))
        self.max_index = max(self.index_list)
        self.coords = self.get_coords()
        step_size = self.get_step_size()
        cube = ion_datacube(step_size=step_size)
        cube.add_coords(self.coords)
        self.cube_pixel_indices = cube.pixel_indices
        self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
        self.histogram_mz_axis = {}
        self.mz_min = 9999999999999.
        self.mz_max = 0.
        if cache_spectra or do_summary:
            # load data into memory
            self.mz_list = []
            self.count_list = []
            self.idx_list = []
            if do_summary:
                self.mic = np.zeros((len(self.index_list), 1))
                self.tic = np.zeros((len(self.index_list), 1))
            for ii in self.index_list:
                # load spectrum, keep values gt0 (shouldn't be here anyway)
                this_spectrum = self.get_spectrum(ii)
                mzs, counts = this_spectrum.get_spectrum(source='centroids')
                if len(mzs) != len(counts):
                    raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
                # Enforce data limits
                valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
                counts = counts[valid]
                mzs = mzs[valid]
                has_peaks = len(mzs) > 0
                # record min/max; a spectrum left empty by the filter contributes nothing
                if has_peaks:
                    self.mz_min = min(self.mz_min, np.min(mzs))
                    self.mz_max = max(self.mz_max, np.max(mzs))
                # append ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
                if cache_spectra:
                    self.mz_list.append(mzs)
                    self.count_list.append(counts)
                    self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
                # record summary values (both stay 0 for an empty spectrum)
                if do_summary and has_peaks:
                    self.tic[ii] = np.sum(counts)
                    self.mic[ii] = np.max(counts)
            print('loaded spectra')
            if cache_spectra:
                self.mz_list = np.concatenate(self.mz_list)
                self.count_list = np.concatenate(self.count_list)
                self.idx_list = np.concatenate(self.idx_list)
                # sort by mz for fast image formation
                mz_order = np.argsort(self.mz_list)
                self.mz_list = self.mz_list[mz_order]
                self.count_list = self.count_list[mz_order]
                self.idx_list = self.idx_list[mz_order]
                # split binary searches into two stages for better locality
                self.window_size = 1024
                self.mz_sublist = self.mz_list[::self.window_size].copy()
        print('file loaded')


    def get_step_size(self):
        """Return the step size handed to ion_datacube: [1,1,1] for imzML, [] otherwise."""
        if self.file_type == '.imzml':
            return [1,1,1]
        else:
            return []


    def get_coords(self):
        """Return the coordinate array of all spectra from the parser matching the file type."""
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            coords = self.get_coords_imzml()
            # swap the first two columns of the imzML coordinates
            coords[:,[0, 1]] = coords[:,[1, 0]]
        elif self.file_type == '.hdf5':
            coords = self.get_coords_hdf5()
        else:
            raise TypeError('File type not recogised: {}'.format(self.file_type))
        return coords


    def get_coords_imzml(self):# get real world coordinates
        """Return the imzML coordinates as an array, padding 2D coordinates with a zero z column."""
        print('TODO: convert indices into real world coordinates')
        coords = np.asarray(self.imzml.coordinates)
        if len(self.imzml.coordinates[0]) == 2: #2D - append zero z-coord
            coords = np.concatenate((coords,np.zeros((len(coords),1))),axis=1)
        return coords


    def get_coords_hdf5(self):
        """Read the coordinates of every spectrum in ``index_list`` from the hdf5 file."""
        coords = np.zeros((len(self.index_list), 3))
        for k in self.index_list:
            coords[k, :] = self.hdf['/spectral_data/' + str(k) + '/coordinates/']
        return coords


    def get_spectrum(self,index):
        """Return the spectrum at ``index``, normalised according to ``self.norm``.

        'TIC' divides the centroid counts by their sum, 'RMS' by their root
        mean square and 'MAD' by the median absolute deviation from the mean.
        Any other value leaves the counts untouched.
        """
        # wrapper for redirecting requests to correct parser
        if self.file_type == '.imzml':
            this_spectrum = self.get_spectrum_imzml(index)
        elif self.file_type == '.hdf5':
            this_spectrum = self.get_spectrum_hdf5(index)
        else:
            raise TypeError('File type not recogised: {}'.format(self.file_type))
        # Both "no normalisation" spellings used by the constructors ('' and [])
        # are falsy, so neither triggers the read-modify-write below.
        if self.norm:
            mzs,counts = this_spectrum.get_spectrum(source="centroids")
            if self.norm == 'TIC':
                counts = counts / np.sum(counts)
            elif self.norm == 'RMS':
                counts = counts / np.sqrt(np.mean(np.square(counts)))
            elif self.norm == 'MAD':
                counts = counts/np.median(np.absolute(counts - np.mean(counts)))
            this_spectrum.add_centroids(mzs,counts)
        return this_spectrum


    def get_spectrum_imzml(self,index):
        """Read spectrum ``index`` from the imzML parser and store it as centroids."""
        mzs, intensities = self.imzml.getspectrum(index)
        ## temp hack -> assume centroided
        this_spectrum = mass_spectrum()
        this_spectrum.add_centroids(mzs,intensities)
        return this_spectrum

    def get_spectrum_hdf5(self, index):
        """Read spectrum ``index`` from the hdf5 file.

        Profile data ('mzs'/'intensities') and centroid data
        ('centroid_mzs'/'centroid_intensities') are each optional, but at
        least one of them must exist.

        :raises ValueError: if neither dataset pair is present for ``index``
        """
        this_spectrum = mass_spectrum()
        tmp_str = '/spectral_data/%d' % (index)
        try:
            this_spectrum.add_spectrum(self.hdf[tmp_str + '/mzs/'], self.hdf[tmp_str + '/intensities/'])
            got_spectrum = True
        except KeyError:
            got_spectrum = False
        try:
            this_spectrum.add_centroids(self.hdf[tmp_str + '/centroid_mzs/'],
                                        self.hdf[tmp_str + '/centroid_intensities/'])
            got_centroids = True
        except KeyError:
            got_centroids = False
        if not any([got_spectrum, got_centroids]):
            raise ValueError('No spectral data found in index {}'.format(index))
        return this_spectrum

    def empty_datacube(self):
        """Return an ion_datacube pre-filled with this dataset's coordinates and pixel layout."""
        data_out = ion_datacube()
        # add precomputed pixel indices
        data_out.coords = self.coords
        data_out.pixel_indices = self.cube_pixel_indices
        data_out.nRows = self.cube_n_row
        data_out.nColumns = self.cube_n_col
        return data_out

    def get_ion_image(self, mzs, tols, tol_type='ppm'):
        """Sum, per spectrum, the counts of all cached peaks within ``mz +/- tol``.

        Needs the cached, m/z-sorted peak arrays, i.e. the file must have been
        loaded with ``cache_spectra=True``.

        :param mzs: a single m/z value or a sequence of them
        :param tols: a single tolerance or one per m/z value
        :param tol_type: 'ppm' scales the tolerance by each m/z (tol * mz / 1e6);
                         any other value uses ``tols`` as given
        :return: ion_datacube with one xic added per requested m/z
        """
        data_out = self.empty_datacube()
        # Normalise the inputs to 1-D arrays so scalars, lists and tuples behave alike
        mzs = np.atleast_1d(np.asarray(mzs))
        tols = np.asarray(tols)
        if tol_type == 'ppm':
            tols = tols * mzs / 1e6  # to m/z
        # one tolerance per m/z, so a single tolerance can serve several m/z values
        tols = tols + np.zeros(mzs.shape)

        # Fast search for insertion point of mz in self.mz_list
        # First stage is looking for windows using the sublist
        idx_left = np.searchsorted(self.mz_sublist, mzs - tols, 'l')
        idx_right = np.searchsorted(self.mz_sublist, mzs + tols, 'r')
        for mz, tol, il, ir in zip(mzs, tols, idx_left, idx_right):
            l = max(il - 1, 0) * self.window_size
            r = ir * self.window_size
            # Second stage is binary search within the windows
            il = l + np.searchsorted(self.mz_list[l:r], mz - tol, 'l')
            ir = l + np.searchsorted(self.mz_list[l:r], mz + tol, 'r')
            # slice list for code clarity
            idx_vect = self.idx_list[il:ir]
            count_vect = self.count_list[il:ir]
            # bin vectors
            ion_vect = np.bincount(idx_vect, weights=count_vect, minlength=self.max_index + 1)
            data_out.add_xic(ion_vect, [mz], [tol])
        return data_out

    # Form histogram axis
    def generate_histogram_axis(self, ppm=1.):
        """Build an m/z axis from ``mz_min`` past ``mz_max`` whose steps grow by ``ppm`` parts per million.

        The axis is stored in ``self.histogram_mz_axis[ppm]``.

        :raises ValueError: if the step could never reach ``mz_max`` (ppm <= 0, or a
                            non-positive ``mz_min`` that is not already above ``mz_max``)
        """
        if ppm <= 0:
            raise ValueError('ppm must be positive; got {}'.format(ppm))
        if self.mz_min <= 0 and self.mz_min <= self.mz_max:
            # a multiplicative step never grows away from a non-positive start value
            raise ValueError('mz_min must be positive; got {}'.format(self.mz_min))
        ppm_mult = ppm * 1e-6
        mz_current = self.mz_min
        mz_list = [mz_current,]
        while mz_current <= self.mz_max:
            mz_current = mz_current + mz_current * ppm_mult
            mz_list.append(mz_current)
        self.histogram_mz_axis[ppm] = mz_list

    def get_histogram_axis(self, ppm=1.):
        """Return the histogram axis for ``ppm``, generating and caching it on first use."""
        if ppm not in self.histogram_mz_axis:
            print('generating histogram axis for ppm {}'.format(ppm))
            self.generate_histogram_axis(ppm=ppm)
        return self.histogram_mz_axis[ppm]

    def generate_summary_spectrum(self, summary_type='mean', ppm=1.):
        """Summarise all cached peaks along the histogram axis for ``ppm``.

        :param summary_type: 'mean' - summed counts per bin divided by the number of spectra;
                             'freq' - number of distinct spectra with a peak in the bin,
                             divided by the number of spectra
        :param ppm: bin width of the histogram axis in parts per million
        :return: (hist_axis, spectrum); the last spectrum entry is always 0 because
                 the axis has one more edge than there are bins
        :raises ValueError: for an unknown ``summary_type``
        """
        if summary_type not in ('mean', 'freq'):
            raise ValueError('Summary type not recognised; {}'.format(summary_type))
        hist_axis = self.get_histogram_axis(ppm=ppm)
        # calculate the summary along the m/z axis
        mean_spec = np.zeros(np.shape(hist_axis))
        for ii in range(0, len(hist_axis) - 1):
            mz_upper = hist_axis[ii + 1]
            mz_lower = hist_axis[ii]
            idx_left = bisect.bisect_left(self.mz_list, mz_lower)
            idx_right = bisect.bisect_right(self.mz_list, mz_upper)
            if summary_type == 'mean':
                mean_spec[ii] = np.sum(self.count_list[idx_left:idx_right])
            else:
                idx_vect = self.idx_list[idx_left:idx_right]
                mean_spec[ii] = float(len(np.unique(idx_vect)))
        # both summary types are reported per spectrum
        mean_spec = mean_spec / len(self.index_list)
        return hist_axis, mean_spec

    def get_summary_image(self,summary_func='tic'):
        """Return an ion_datacube holding the per-spectrum 'tic' or 'mic' values.

        :raises KeyError: if ``summary_func`` is neither 'tic' nor 'mic'
        """
        if summary_func not in ['tic','mic']: raise KeyError("requested type not in 'tic' mic'")
        data_out = self.empty_datacube()
        data_out.add_xic(np.asarray(getattr(self, summary_func))[self.index_list], [0], [0])
        return data_out
Ejemplo n.º 38
0
 def load_file(self, filename, min_mz=0, max_mz=np.inf, min_int=0, index_range=None, cache_spectra=True, do_summary=True, norm=None):
     """Parse ``filename`` and optionally cache its peaks and summary values.

     :param filename: path to a ``.imzml`` or ``.hdf5`` file (extension is case-insensitive)
     :param min_mz: peaks with m/z <= min_mz are dropped
     :param max_mz: peaks with m/z >= max_mz are dropped
     :param min_int: peaks with counts <= min_int are dropped
     :param index_range: spectrum indices to load from an hdf5 file; empty/None means all.
                         Ignored for imzML files, where every spectrum is loaded.
     :param cache_spectra: keep all peaks in memory as m/z-sorted arrays
     :param do_summary: compute ``tic`` (sum of counts) and ``mic`` (maximum count) per spectrum
     :param norm: stored in ``self.norm``; None is stored as an empty list
     :raises TypeError: for an unsupported file extension or mismatched mz/count lengths
     """
     # parse file to get required parameters
     # can use thin hdf5 wrapper for getting data from file
     self.file_dir, self.filename = os.path.split(filename)
     self.filename, self.file_type = os.path.splitext(self.filename)
     self.file_type = self.file_type.lower()
     self.norm = [] if norm is None else norm
     if self.file_type == '.hdf5':
         import h5py
         self.hdf = h5py.File(filename, 'r')  # Readonly, file must exist
         if index_range is None or len(index_range) == 0:
             # a real list: the indices are iterated several times (a Python 3 map is one-shot)
             self.index_list = list(map(int, self.hdf['/spectral_data'].keys()))
         else:
             self.index_list = index_range
     elif self.file_type == '.imzml':
         from pyimzml.ImzMLParser import ImzMLParser
         self.imzml = ImzMLParser(filename)
         self.index_list = list(range(0, len(self.imzml.coordinates)))
     else:
         raise TypeError('File type not recogised: {}'.format(self.file_type))
     self.max_index = max(self.index_list)
     self.coords = self.get_coords()
     step_size = self.get_step_size()
     cube = ion_datacube(step_size=step_size)
     cube.add_coords(self.coords)
     self.cube_pixel_indices = cube.pixel_indices
     self.cube_n_row, self.cube_n_col = cube.nRows, cube.nColumns
     self.histogram_mz_axis = {}
     self.mz_min = 9999999999999.
     self.mz_max = 0.
     if cache_spectra or do_summary:
         # load data into memory
         self.mz_list = []
         self.count_list = []
         self.idx_list = []
         if do_summary:
             self.mic = np.zeros((len(self.index_list), 1))
             self.tic = np.zeros((len(self.index_list), 1))
         for ii in self.index_list:
             # load spectrum, keep values gt0 (shouldn't be here anyway)
             this_spectrum = self.get_spectrum(ii)
             mzs, counts = this_spectrum.get_spectrum(source='centroids')
             if len(mzs) != len(counts):
                 raise TypeError('length of mzs ({}) not equal to counts ({})'.format(len(mzs), len(counts)))
             # Enforce data limits
             valid = np.where((mzs > min_mz) & (mzs < max_mz) & (counts > min_int))
             counts = counts[valid]
             mzs = mzs[valid]
             has_peaks = len(mzs) > 0
             # record min/max; a spectrum left empty by the filter contributes nothing
             if has_peaks:
                 self.mz_min = min(self.mz_min, np.min(mzs))
                 self.mz_max = max(self.mz_max, np.max(mzs))
             # append ever-growing lists (should probably be preallocated or piped to disk and re-loaded)
             if cache_spectra:
                 self.mz_list.append(mzs)
                 self.count_list.append(counts)
                 self.idx_list.append(np.ones(len(mzs), dtype=int) * ii)
             # record summary values (both stay 0 for an empty spectrum)
             if do_summary and has_peaks:
                 self.tic[ii] = np.sum(counts)
                 self.mic[ii] = np.max(counts)
         print('loaded spectra')
         if cache_spectra:
             self.mz_list = np.concatenate(self.mz_list)
             self.count_list = np.concatenate(self.count_list)
             self.idx_list = np.concatenate(self.idx_list)
             # sort by mz for fast image formation
             mz_order = np.argsort(self.mz_list)
             self.mz_list = self.mz_list[mz_order]
             self.count_list = self.count_list[mz_order]
             self.idx_list = self.idx_list[mz_order]
             # split binary searches into two stages for better locality
             self.window_size = 1024
             self.mz_sublist = self.mz_list[::self.window_size].copy()
     print('file loaded')
class CleanImageSearch(object):
    """Search an imzML dataset for molecules and extract their images.

    The constructor opens the imzML file and the raw data, estimates the
    resolution, generates isotope patterns for the formulas file and prepares
    the summary spectra used by the search methods.
    """

    def __init__(self, imzml_filename, raw_data_filename, formulas_filename):
        """Load all inputs and precompute patterns and summary spectra.

        :param imzml_filename: path handed to ImzMLParser
        :param raw_data_filename: path ending in ".h5" (opened with h5py) or ".RAW" (kept as a string)
        :param formulas_filename: path handed to generate_patterns
        :raises ValueError: if the raw data file has any other extension
        """
        self.imzml = ImzMLParser(imzml_filename)
        self.formulas_fn = formulas_filename

        is_hdf5 = raw_data_filename.endswith(".h5")
        if not is_hdf5 and not raw_data_filename.endswith(".RAW"):
            raise ValueError("only .h5 and .RAW are supported")
        self.raw = h5py.File(raw_data_filename) if is_hdf5 else str(raw_data_filename)

        n_spectra = 25
        logging.info("estimating resolution from %d random raw spectra..." % n_spectra)
        self.resolution_func = resolution_estimate(self.raw, n_spectra)
        resolution_at_200 = round(self.resolution_func(200))
        logging.info("resolution is %d @ 200" % resolution_at_200)

        self.mz_range = self.imzml.get_mz_range()
        logging.info("m/z range: %f .. %f" % self.mz_range)

        logging.info("generating isotope patterns...")
        self.patterns = generate_patterns(self.formulas_fn, self.resolution_func, self.mz_range)

        logging.info("computing mean spectrum...")
        raw_mzs, self.mean_intensities = read_mean_spectrum(self.raw)

        logging.info("computing mean spectrum from centroided data...")
        summary = generate_summary_spectrum3(raw_mzs, self.imzml)
        self.mzs, self.intensities, self.frequencies = summary

        # number of image rows handled per molecule by the extract_* methods
        self.n = 5

    def find_good_matches(self,
            min_peaks=3, min_intensity_share=0.99, min_iso_corr=0.95):
        """Return SpectralMatch objects for the molecules found by find_clean_molecules.

        Matches with fewer than ``self.n`` theoretical m/z values are dropped;
        the rest are returned sorted by their first theoretical m/z.
        """
        candidates = find_clean_molecules(self.mzs, self.intensities, self.patterns,
                                          min_peaks=min_peaks,
                                          min_intensity_share=min_intensity_share,
                                          min_iso_corr=min_iso_corr)

        def pattern_sort_key(key):
            return self.patterns[key][0][0]

        kept = []
        for formula, adduct in sorted(candidates, key=pattern_sort_key):
            candidate = SpectralMatch(formula, adduct, self.patterns, self.mzs,
                                      self.intensities, self.resolution_func)
            if len(candidate.theor_mzs) >= self.n:
                kept.append(candidate)
        kept.sort(key=lambda match: match.theor_mzs[0])
        return kept

    def _sf2m(self, formulas):
        """Turn (formula, adduct) string tuples into SpectralMatch objects.

        An empty input gives an empty list; input whose first element is not a
        tuple starting with a str is returned unchanged.
        """
        if len(formulas) == 0:
            return []
        head = formulas[0]
        looks_like_pairs = isinstance(head, tuple) and isinstance(head[0], str)
        if not looks_like_pairs:
            return formulas
        converted = []
        for formula, adduct in formulas:
            converted.append(SpectralMatch(formula, adduct, self.patterns, self.mzs,
                                           self.intensities, self.resolution_func))
        return converted

    def extract_images(self, formulas, n_bins=15):
        """Return one MolecularImage per formula, each built from ``self.n`` consecutive image rows."""
        matches = self._sf2m(formulas)
        raw_images, nrow, ncol = _get_images(matches, self.imzml, self.n, n_bins)
        images = []
        for position, match in enumerate(matches):
            first_row = position * self.n
            rows = raw_images[first_row : first_row + self.n, :]
            images.append(MolecularImage(rows, nrow, ncol,
                                         match.formula, match.adduct, self.patterns))
        return images

    def extract_figures(self, formulas, min_img_corr=0.7, n_bins=15, **kwargs):
        """Return a MolecularFigure for every formula whose image correlation reaches ``min_img_corr``.

        Extra keyword arguments are forwarded to MolecularFigure.
        """
        matches = self._sf2m(formulas)
        images, nrow, ncol = _get_images(matches, self.imzml, self.n, n_bins)
        figures = []
        for position, match in enumerate(matches):
            first_row = position * self.n
            rows = slice(first_row, first_row + self.n)
            correlation = match.image_correlation(images[rows, :])
            if not correlation < min_img_corr:
                figures.append(
                    MolecularFigure(self.n, match, images[rows, :], nrow, ncol,
                                    self.mzs, self.intensities, self.mean_intensities,
                                    self.frequencies, **kwargs))
        return figures
Ejemplo n.º 40
0
    def __compute_file_info(cls, filename, resolution):
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        """
        Internal helper function used to compute the mz axis, data type for the intensities, format type

        The file is treated as 'continuous' as long as every spectrum has the same m/z values as the
        first one; otherwise it is 'processed' and a new logarithmically spaced m/z axis covering the
        full m/z range is generated.

        :param filename: path of the imzML file
        :param resolution: used to size the regenerated axis of a processed file
            (number of points = ceil(1e6 * ln(max_mz / min_mz) / resolution))

        :return: Numpy array with the coordinates of all spectra
        :return: Numpy array with mz axis
        :return: string with data type
        :return: imzml file type
        :return: dataset, instrument and method metadata dictionaries
        """
        reader = ImzMLParser(filename)
        # Read the first spectrum
        mz_axes, intens = reader.getspectrum(0)
        # Read the coordinates
        coordinates = np.asarray(reader.coordinates)

        # Determine the data type for the intensity values
        dtype = np.asarray(intens).dtype.str

        # Compute the mz axis and file type
        file_type = cls.available_imzml_types['continuous']
        min_mz, max_mz = np.amin(mz_axes), np.amax(mz_axes)
        for ind in range(coordinates.shape[0]):
            mz, _ = reader.getspectrum(ind)
            # array_equal copes with tuples, lists and numpy arrays of any length,
            # whereas "==" on numpy arrays has no single truth value
            if np.array_equal(mz, mz_axes):
                continue
            file_type = cls.available_imzml_types['processed']
            min_mz = min(min_mz, np.amin(mz))
            max_mz = max(max_mz, np.amax(mz))
        # Reinterpolate the mz-axis if we have a processed mode imzml file
        if file_type == cls.available_imzml_types['processed']:
            # the number of samples must be an integer
            n_points = int(np.ceil(1e6 * np.log(max_mz/min_mz)/resolution))
            mz_axes = np.logspace(np.log10(min_mz), np.log10(max_mz), n_points)
            log_helper.info(__name__, "Reinterpolated m/z axis for processed imzML file")

        # Construct the imzml metadata information
        dataset_metadata = metadata_dict()
        instrument_metadata = metadata_dict()
        method_metadata = metadata_dict()
        for k, v in reader.imzmldict.items():
            dataset_metadata[k] = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=k,
                                                 ontology=None)

        # Delete the parser and read the metadata
        del reader

        # Parse the metadata for the file. We try to parse only the header and ignore the
        # <run > group in the XML file to avoid going through the whole file again
        # while extracting the majority of the relevant metadata
        try:
            with open(filename, 'r') as ins:
                header_lines = []
                for line in ins:
                    if '<run' in line:
                        break
                    header_lines.append(line)
                # close the root element that was cut off together with <run>
                header_lines.append('</mzML>')
                metdata_header = ''.join(header_lines)
                metdata_header_dict = xmltodict.parse(metdata_header)['mzML']
                for k, v in metdata_header_dict.items():
                    store_value = metadata_value(name=k,
                                                 value=v,
                                                 unit=None,
                                                 description=str(k) + " extracted from imzML XML header.",
                                                 ontology=None)
                    if k == 'instrumentConfigurationList':
                        instrument_metadata[k] = store_value
                    elif k in ('dataProcessingList', 'softwareList', 'sampleList'):
                        method_metadata[k] = store_value
                    else:
                        # includes 'scanSettingsList'
                        dataset_metadata[k] = store_value
                dataset_metadata['imzml_xml_metadata_header'] = metadata_value(name='imzml_xml_metadata_header',
                                                                               value=metdata_header,
                                                                               unit=None,
                                                                               description='XML imzML header',
                                                                               ontology=None)
        except Exception:
            # The extra metadata is best effort: keep going, but let KeyboardInterrupt/SystemExit through
            log_helper.warning(__name__, "Extraction of additional imzML metadata failed")

        return coordinates, np.asarray(mz_axes), dtype, file_type, dataset_metadata, instrument_metadata, method_metadata
Ejemplo n.º 41
0
class NoiseGenerator(object):
    """Derive per-pixel noise from a real imzML dataset and an NMF model.

    The real dataset is read through ``ImzMLParser``; the factorization
    (arrays ``W``, ``H``, ``shape`` and ``mz_axis``) is loaded from an
    ``.npz`` file.  For every pixel processed, the norms of the real,
    simulated, ground-truth and noise spectra are recorded and can be
    written out with :meth:`saveStatistics`.
    """

    def __init__(self, nmf_fn, layers_fn, imzml_fn):
        """Load the real dataset and the NMF factorization.

        :param nmf_fn: path of the ``.npz`` file holding ``shape``, ``W``,
            ``H`` and ``mz_axis``.
        :param layers_fn: path of the pickled layers file; currently unused
            because the call to ``_removeAssignedBins`` is disabled.
        :param imzml_fn: path of the real imzML dataset.
        """
        self._imzml = ImzMLParser(imzml_fn)
        with np.load(nmf_fn) as data:
            nx, ny = data['shape']
            # W is stored flat; reshape so it can be indexed as W[x, y, :].
            self._W = data['W'].reshape((nx, ny, -1))
            self._H = data['H']
            self._mz_axis = data['mz_axis']

        # Per-pixel statistics, all keyed by (x, y).
        self._norm_real = {}
        self._norm_simulated = {}
        self._norm_groundtruth = {}
        self._norm_noise = {}
        self._norm_diff = {}

        # Map (x, y) -> spectrum index in the real dataset.
        self._coords = {}
        for i, coords in enumerate(self._imzml.coordinates):
            self._coords[(coords[0], coords[1])] = i

        # One bin edge per (mz, ppm) entry of the m/z axis.
        # NOTE(review): presumably the upper edge of each bin -- confirm.
        self._mz_bins = [mz * (1.0 + 1e-6 * ppm) for mz, ppm in self._mz_axis]

        # self._removeAssignedBins(layers_fn)

    def _removeAssignedBins(self, layers_fn):
        """Zero every entry of ``H`` outside the assigned m/z bins.

        Marked as buggy by the original author and not called anywhere in
        this class.

        :param layers_fn: path of the pickled layers file.
        """
        # buggy at the moment
        # NOTE(review): unpickling executes arbitrary code; only load layer
        # files that come from a trusted source.
        with open(layers_fn, 'rb') as f:
            layers = cPickle.load(f)
        for i in layers['layers_list']:
            assigned = layers['layers_list'][i]['assigned_mz_bins']
            assigned = assigned[assigned < self._H[i].shape[0]]
            # Single-argument call form prints the same text on Python 2
            # and Python 3.
            print("#assigned bins in component #{}: {}".format(i + 1, len(assigned)))
            h = np.zeros_like(self._H[i])
            h[assigned] = self._H[i][assigned]
            self._H[i] = h

    def _getRealSpectrum(self, x, y):
        """Return the ``(mzs, intensities)`` of the real spectrum at (x, y)."""
        return self._imzml.getspectrum(self._coords[(x, y)])

    def _norm(self, intensities):
        """Return ``np.linalg.norm`` of ``intensities``."""
        return np.linalg.norm(intensities)

    def generateNoise(self, x, y):
        """Compute noise peaks for the pixel at (x, y).

        The real spectrum is restricted to the range covered by the m/z
        bins and summed per bin.  The absolute difference between that and
        the NMF approximation ``W[x, y, :].dot(H)`` is the per-bin noise.
        Every real peak gets the noise of its bin, scaled by the
        module-level ``args.inflate_noise``.  The norm of the binned real
        spectrum is stored in ``_norm_real``.

        :param x: pixel x coordinate.
        :param y: pixel y coordinate.
        :return: tuple ``(noise_mzs, noise_intensities)`` containing only
            the peaks whose noise exceeds half of the smallest in-range
            real intensity.
        """
        real_spectrum = self._getRealSpectrum(x, y)
        real_mzs, real_intensities = map(np.array, real_spectrum)

        # Keep only the peaks that fall between the first and last bin edge.
        min_mz, max_mz = self._mz_bins[0], self._mz_bins[-1]
        inside_range = (real_mzs >= min_mz) & (real_mzs <= max_mz)
        real_mzs = real_mzs[inside_range]
        real_intensities = real_intensities[inside_range]

        # Sum the real intensities per bin.
        bins = np.digitize(real_mzs, self._mz_bins)
        n_bins = len(self._mz_bins)
        binned_real_intensities = np.bincount(bins, real_intensities, n_bins)
        self._norm_real[(x, y)] = self._norm(binned_real_intensities)

        binned_approx_intensities = self._W[x, y, :].dot(self._H)
        noise = np.abs(binned_real_intensities - binned_approx_intensities)
        # FIXME: avoid duplicating noise
        # (every peak in a bin receives that bin's full noise value)
        noise_intensities = noise[bins] * args.inflate_noise
        noise_mzs = np.array(real_mzs)
        nnz = noise_intensities > min(real_intensities) / 2
        return noise_mzs[nnz], noise_intensities[nnz]

    def addNoise(self, profile_spectrum, coords):
        """Centroid a simulated spectrum and merge it with generated noise.

        Records the noise, ground-truth, simulated and difference norms for
        the pixel.

        :param profile_spectrum: pair ``(mzs, intensities)`` of the clean
            simulated spectrum.
        :param coords: pixel coordinates; only the first two entries
            ``(x, y)`` are used, so ``(x, y, z)`` tuples are accepted.
        :return: tuple ``(mzs, intensities)`` sorted by m/z and restricted
            to peaks above the smallest intensity of the real spectrum.
        """
        # A list (not a lazy map object) so that it can be indexed below.
        spec = [np.array(part) for part in profile_spectrum]
        p = centroidize(*spec)
        mzs = np.array(p.masses)
        # Scale the centroided abundances by the profile maximum.
        mult = spec[1].max() if len(spec[1]) > 0 else 1
        intensities = np.array(p.abundances) * mult

        x, y = coords[:2]
        # Pass x and y explicitly: splatting ``coords`` raises TypeError
        # when it also carries a z component.
        limit = min(self._getRealSpectrum(x, y)[1])

        noise_mzs, noise_intensities = self.generateNoise(x, y)
        self._norm_noise[(x, y)] = self._norm(noise_intensities[noise_intensities > limit])
        self._norm_groundtruth[(x, y)] = self._norm(intensities[intensities > limit])
        self._norm_simulated[(x, y)] = self._norm_noise[(x, y)] + self._norm_groundtruth[(x, y)]
        self._norm_diff[(x, y)] = abs(self._norm_simulated[(x, y)] - self._norm_real[(x, y)])
        mzs = np.concatenate([mzs, noise_mzs])
        intensities = np.concatenate([intensities, noise_intensities])

        # Drop everything at or below the smallest real intensity.
        detectable = np.where(intensities > limit)[0]
        mzs = mzs[detectable]
        intensities = intensities[detectable]

        order = mzs.argsort()
        return mzs[order], intensities[order]

    def saveStatistics(self, filename):
        """Write the collected per-pixel norms to ``filename`` as an npz archive.

        The archive holds the arrays ``real``, ``simulated``,
        ``groundtruth``, ``noise`` and ``diff``, each indexed ``[x, y]``.
        Pixels without a recorded value are 0.

        :param filename: path of the output file; it is written exactly
            under this name (no ``.npz`` suffix is appended).
        """
        def toRect(d):
            # Nothing recorded yet: save an empty image instead of failing
            # on max() of an empty sequence.
            if not d:
                return np.zeros((0, 0))
            xs = [k[0] for k in d]
            ys = [k[1] for k in d]
            img = np.zeros((max(xs) + 1, max(ys) + 1))
            for k in d:
                img[k[0], k[1]] = d[k]
            return img

        # np.savez emits binary zip data, so the file must be opened in
        # binary mode.
        with open(filename, "wb") as f:
            np.savez(f,
                     real=toRect(self._norm_real),
                     simulated=toRect(self._norm_simulated),
                     groundtruth=toRect(self._norm_groundtruth),
                     noise=toRect(self._norm_noise),
                     diff=toRect(self._norm_diff))
Ejemplo n.º 42
0
        order = mzs.argsort()
        return mzs[order], intensities[order]

    def saveStatistics(self, filename):
        """Write the collected per-pixel norms to ``filename`` as an npz archive.

        The archive holds the arrays ``real``, ``simulated``,
        ``groundtruth``, ``noise`` and ``diff``, each built from the
        matching ``self._norm_*`` dict and indexed by the two components of
        its keys.  Positions without a recorded value are 0.

        :param filename: path of the output file; it is written exactly
            under this name (no ``.npz`` suffix is appended).
        """
        def toRect(d):
            # Nothing recorded yet: save an empty image instead of failing
            # on max() of an empty sequence.
            if not d:
                return np.zeros((0, 0))
            xs = [k[0] for k in d]
            ys = [k[1] for k in d]
            img = np.zeros((max(xs) + 1, max(ys) + 1))
            for k in d:
                img[k[0], k[1]] = d[k]
            return img

        # np.savez emits binary zip data, so the file must be opened in
        # binary mode.
        with open(filename, "wb") as f:
            np.savez(f,
                     real=toRect(self._norm_real),
                     simulated=toRect(self._norm_simulated),
                     groundtruth=toRect(self._norm_groundtruth),
                     noise=toRect(self._norm_noise),
                     diff=toRect(self._norm_diff))

# Driver: build the noise model from the real dataset, add noise to every
# spectrum of the clean simulated dataset, and write the noisy dataset plus
# the per-pixel norm statistics.
ng = NoiseGenerator(args.nmf, args.layers, args.real)

imzml_sim = ImzMLParser(args.simclean)

with ImzMLWriter(args.output, mz_dtype=np.float32) as writer:
    for spectrum_idx, pixel_coords in enumerate(imzml_sim.coordinates):
        clean_spectrum = imzml_sim.getspectrum(spectrum_idx)
        noisy_mzs, noisy_intensities = ng.addNoise(clean_spectrum, pixel_coords)
        writer.addSpectrum(noisy_mzs, noisy_intensities, pixel_coords)

# The statistics are stored next to the output dataset.
stats_path = args.output + ".norms"
ng.saveStatistics(stats_path)