Beispiel #1
0
    def _search_string_identifier(self, search_string, chunk_size=8):
        """
        Scan the file from the start for a spectrum whose id contains
        'search_string'.

        The file at ``self.path`` is read chunk by chunk; each chunk is
        extended up to the next tag end so that an opening spectrum tag is
        never split across two reads.

        Arguments:
            search_string (str): text that has to occur in the id attribute
                of the spectrum

        Keyword Arguments:
            chunk_size (int): number of 512 byte blocks read per iteration

        Returns:
            Spectrum: spectrum built from the first matching spectrum element

        Raises:
            Exception: if an empty read is reached without any match
        """
        total_chunk_size = chunk_size * 512

        # NOTE: This needs to go into regex_patterns.py
        # The pattern is a raw string so that '\s' is a regex escape and not
        # an (invalid) string escape. The search string is escaped because it
        # is meant literally - the substring check below treats it that way.
        id_fragment = "".join(['.*', re.escape(search_string), '.*'])
        regex_string = re.compile(
            (r'<\s*spectrum[^>]*index="[0-9]+"\sid="({0})"'
             r'\sdefaultArrayLength="[0-9]+">')
            .format(id_fragment).encode())

        search_string = search_string.encode()

        with open(self.path, 'rb') as seeker:
            while True:
                file_pointer = seeker.tell()

                data = seeker.read(total_chunk_size)
                string, seeker = self._read_until_tag_end(seeker,
                                                          byte_mode=True)
                data += string
                spec_start = regex_string.search(data)
                if spec_start:
                    # translate the match position into an absolute offset
                    spec_start_offset = file_pointer + spec_start.start()
                    current_index = spec_start.group(1)
                    if search_string in current_index:
                        seeker.seek(spec_start_offset)
                        start, end = self._read_to_spec_end(seeker)
                        seeker.seek(start)
                        spec_string = seeker.read(end - start)
                        xml_string = XML(spec_string)
                        return spec.Spectrum(xml_string,
                                             measured_precision=5e-6)
                elif len(data) == 0:
                    # nothing left to read
                    raise Exception('cant find specified string')
Beispiel #2
0
    def __getitem__(self, identifier):
        """
        Access the item with id 'identifier' in the file by iterating the xml-tree.

        The file is parsed from the beginning. The position the file handler
        had before the call is restored afterwards, whether or not an item
        was found.

        Arguments:
            identifier (int or str): for spectra, the number extracted from
                the id attribute with ``regex_patterns.SPECTRUM_ID_PATTERN``
                is compared as an int; for chromatograms the id attribute is
                compared as is

        Returns:
            Spectrum or Chromatogram: object wrapping the matching element

        Raises:
            StopIteration: if the document ends without a matching element
        """
        old_pos = self.file_handler.tell()
        try:
            self.file_handler.seek(0, 0)
            mzml_iter = iter(iterparse(self.file_handler, events=['end']))
            while True:
                # next() raises StopIteration once the document is exhausted;
                # this is kept as the "not found" signal for existing callers
                event, element = next(mzml_iter)
                if event != 'end':
                    continue
                if element.tag.endswith('}spectrum'):
                    spectrum_id = int(
                        regex_patterns.SPECTRUM_ID_PATTERN.search(
                            element.get('id')).group(0))
                    if spectrum_id == identifier:
                        return spec.Spectrum(element, measured_precision=5e-6)
                elif element.tag.endswith('}chromatogram'):
                    if element.get('id') == identifier:
                        return spec.Chromatogram(element,
                                                 measured_precision=5e-6)
        finally:
            # restore the caller's position on success AND on failure
            self.file_handler.seek(old_pos, 0)
Beispiel #3
0
    def __next__(self):
        """
        Return the next spectrum from the event stream ``self.iter``.

        Events are consumed until an 'end' event of a spectrum element shows
        up. If ``self.info`` carries a referenceable param group list, its
        element is applied to the spectrum. The measured precision is looked
        up in ``self.ms_precisions`` by the ms level of the spectrum.

        Returns:
            Spectrum (:py:class:`Spectrum`): a spectrum object with interface
                to the original spectrum element.

        Raises:
            StopIteration: once the event stream is exhausted

        Example:

        >>> for spectrum in Reader:
        ...     print(spectrum.mz, end='\\r')

        """
        exhausted = ('END', 'END')
        uses_ref_groups = self.info.get('referenceable_param_group_list',
                                        False)
        while True:
            event, element = next(self.iter, exhausted)
            if event == 'END':
                # sentinel handed back by next(): nothing left to parse
                raise StopIteration
            if event != 'end':
                continue
            if not element.tag.endswith('}spectrum'):
                continue

            spectrum = spec.Spectrum(element)
            if uses_ref_groups:
                group_element = self.info[
                    'referenceable_param_group_list_element']
                spectrum._set_params_from_reference_group(group_element)
            level = spectrum.ms_level
            spectrum.measured_precision = self.ms_precisions[level]
            spectrum.calling_instance = self
            return spectrum
Beispiel #4
0
    def __next__(self):
        """
        Return the next spectrum from the event stream ``self.iter``.

        Events are consumed until an 'end' event of a spectrum element shows
        up. The measured precision of the spectrum is looked up in
        ``self.ms_precisions`` by its ms level.

        Returns:
            Spectrum (:py:class:`Spectrum`): a spectrum object with interface
                to the original spectrum element.

        Raises:
            StopIteration: once the event stream is exhausted

        Example:

        >>> for spectrum in Reader:
        ...     print(spectrum.mz, end='\\r')

        """
        exhausted = ('END', 'END')
        while True:
            event, element = next(self.iter, exhausted)
            if event == 'END':
                # sentinel handed back by next(): nothing left to parse
                raise StopIteration
            if event != 'end':
                continue
            if not element.tag.endswith('}spectrum'):
                continue

            spectrum = spec.Spectrum(element)
            level = spectrum.ms_level
            spectrum.measured_precision = self.ms_precisions[level]
            spectrum.calling_instance = self
            return spectrum
Beispiel #5
0
    def _search_linear(self, seeker, index, chunk_size=8):
        """
        Fallback to linear search if interpolated search fails.

        Reads forward from the current position of 'seeker', one spectrum
        after the other, until the spectrum whose id ends in 'index' is
        found.

        Arguments:
            seeker (file object): open file positioned at or before the
                spectrum to find; it is closed before this method returns
                or raises
            index (int): number at the end of the id of the wanted spectrum

        Keyword Arguments:
            chunk_size (int): number of 512 byte blocks read per iteration

        Returns:
            Spectrum: the spectrum with the requested index

        Raises:
            KeyError: if an empty read is reached before the spectrum (or
                the end of a spectrum that was already opened) is found
        """
        total_chunk_size = chunk_size * 512
        trailing_digits = re.compile(b'[0-9]*$')
        while True:
            file_pointer = seeker.tell()

            data = seeker.read(total_chunk_size)
            string, seeker = self._read_until_tag_end(seeker)
            data += string
            if len(data) == 0:
                # Nothing left to read: without this check the loop would
                # spin forever on empty reads.
                # NOTE(review): assumes _read_until_tag_end returns an empty
                # value at end of file - confirm.
                seeker.close()
                raise KeyError(index)

            spec_start = self.spec_open.search(data)
            if spec_start is None:
                continue

            spec_start_offset = file_pointer + spec_start.start()
            seeker.seek(spec_start_offset)
            current_index = int(
                trailing_digits.search(spec_start.group('id')).group())
            spec_end = self.spec_close.search(data[spec_start.start():])
            if spec_end:
                # match position is relative to the slice starting at the
                # opening tag, hence the extra spec_start.start()
                spec_end_offset = file_pointer + spec_end.end(
                ) + spec_start.start()
                seeker.seek(spec_end_offset)
            while spec_end is None:
                # closing tag is not in the first chunk: keep reading from
                # the start of the spectrum until it shows up
                file_pointer = seeker.tell()

                data = seeker.read(total_chunk_size)
                string, seeker = self._read_until_tag_end(seeker)
                data += string
                if len(data) == 0:
                    # spectrum is never closed before the data runs out
                    seeker.close()
                    raise KeyError(index)

                spec_end = self.spec_close.search(data)
                if spec_end:
                    spec_end_offset = file_pointer + spec_end.end()
                    self.offset_dict[current_index] = (spec_start_offset,
                                                       spec_end_offset)
                    seeker.seek(spec_end_offset)

            if current_index == index:
                seeker.seek(spec_start_offset)
                spec_string = seeker.read(spec_end_offset -
                                          spec_start_offset)
                self.offset_dict[current_index] = (spec_start_offset,
                                                   spec_end_offset)
                xml_string = XML(spec_string)
                seeker.close()
                return spec.Spectrum(xml_string, measured_precision=5e-6)
Beispiel #6
0
    def __getitem__(self, identifier):
        """
        Access the item with id 'identifier'.

        Dispatches on the identifier:

        * 'TIC' (case-insensitive): the xml-tree is iterated for the
          chromatogram with id 'TIC'
        * identifier already cached in ``self.offset_dict``: the spectrum is
          read straight from the stored start offset
        * any other string: :py:meth:`_search_string_identifier`
        * everything else: :py:meth:`_interpol_search`

        Arguments:
            identifier (str or int): native id of the item to access

        Returns:
            Spectrum or Chromatogram: object for the given identifier

        Raises:
            StopIteration: if 'TIC' is requested but the file holds no
                chromatogram with that id
        """
        #############################################################################
        # DOES NOT HOLD IF NUMBERS DONT START WITH ONE AND/OR DONT INCREASE BY ONE  #
        # TODO FIXME                                                                #
        #############################################################################

        self.file_handler.seek(0)

        if str(identifier).upper() == 'TIC':
            # only 'end' events are requested, so every element is complete
            for _event, element in iterparse(self.file_handler,
                                             events=['end']):
                if (element.tag.endswith('}chromatogram')
                        and element.get('id') == 'TIC'):
                    return spec.Chromatogram(element,
                                             measured_precision=5e-6)
            # kept as the "not found" signal for existing callers
            raise StopIteration

        if identifier in self.offset_dict:
            # only the start offset is trusted; the end is determined anew
            known_start = self.offset_dict[identifier][0]
            with open(self.path, 'rb') as seeker:
                seeker.seek(known_start)
                start, end = self._read_to_spec_end(seeker)
            self.file_handler.seek(start, 0)
            data = self.file_handler.read(end - start)
            return spec.Spectrum(XML(data), measured_precision=5e-6)

        # isinstance also routes str subclasses to the string search instead
        # of the numeric interpolation search
        if isinstance(identifier, str):
            return self._search_string_identifier(identifier)

        return self._interpol_search(identifier)
Beispiel #7
0
    def __getitem__(self, key):
        """
        Execute a SQL request, process the data and return a spectrum object.

        Args:
            key (str or int): unique identifier for the given spectrum in the
            database

        Returns:
            Spectrum or Chromatogram: object built from the stored xml
            element, depending on its tag

        Raises:
            KeyError: if the table 'spectra' has no row with this id
            ValueError: if the stored element is neither a spectrum nor a
            chromatogram
        """
        # The second argument of execute() is the SEQUENCE of parameters.
        # A bare key would be taken as that sequence itself: ints are
        # rejected and every character of a string becomes one parameter.
        # Keys that already are a tuple/list are passed through unchanged.
        params = key if isinstance(key, (tuple, list)) else (key, )
        self.cursor.execute("SELECT * FROM spectra WHERE id=?", params)
        row = self.cursor.fetchone()
        if row is None:
            raise KeyError(key)
        _row_id, element = row

        element = et.XML(element)
        if "spectrum" in element.tag:
            return spec.Spectrum(element)
        if "chromatogram" in element.tag:
            return spec.Chromatogram(element)
        raise ValueError(
            'unexpected element <{0}> stored for id {1!r}'.format(
                element.tag, key))
Beispiel #8
0
    def __getitem__(self, identifier):
        """
        Access the item with id 'identifier' in the file.

        The raw block returned by ``self.Reader.read_block`` is decoded as
        utf-8 and wrapped into an mzML root element, so that the parsed
        element carries the mzML namespace.

        Arguments:
            identifier (str): native id of the item to access

        Returns:
            Spectrum or Chromatogram: object built from the first child of
            the wrapper element; a Chromatogram if 'chromatogram' occurs in
            its tag, otherwise a Spectrum
        """
        #TODO more elegant way to add NameSpace (.register_namespace maybe??)
        wrapper_open = '<mzML xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="test_Creinhardtii_QE_pH8" version="1.1.0" xmlns="http://psi.hupo.org/ms/mzml">'
        wrapper_close = '</mzML>'
        raw_block = self.Reader.read_block(identifier)
        document = ''.join(
            (wrapper_open, raw_block.decode('utf-8'), wrapper_close))
        root = XML(document)
        first_child = root[0]
        if 'chromatogram' in first_child.tag:
            wrapper_class = spec.Chromatogram
        else:
            wrapper_class = spec.Spectrum
        return wrapper_class(first_child, measured_precision=5e-6)
Beispiel #9
0
    def _interpol_search(self,
                         target_index,
                         chunk_size=8,
                         fallback_cutoff=100):
        """
        Use linear interpolation search to find spectra faster.

        The search starts in the middle of the file, reads the index of the
        first spectrum tag found there and jumps to a new position scaled by
        ``target_index / current_index``. As soon as a spectrum closer than
        'fallback_cutoff' indices to the target is seen, the search is
        handed over to :py:meth:`_search_linear`. The start offset of every
        spectrum seen on the way is stored in ``self.offset_dict``.

        Arguments:
            target_index (int): number at the end of the id of the wanted
                spectrum

        Keyword Arguments:
            chunk_size (int): number of 512 byte blocks read in one go
            fallback_cutoff (int): index distance below which the linear
                search takes over

        Returns:
            Spectrum: the spectrum with the requested index
        """
        chunk_size = chunk_size * 512
        trailing_digits = re.compile(b'[0-9]*$')
        used_indices = set()

        # The context manager releases the handle on every exit path,
        # including exceptions. _search_linear also closes the handle it is
        # given; closing twice is harmless.
        with open(self.path, 'rb') as seeker:
            seeker.seek(0, 2)
            file_size = seeker.tell()
            seeker.seek(file_size // 2, 0)
            current_position = seeker.tell()

            while True:
                jumper_scaling = 1
                file_pointer = seeker.tell()
                data = seeker.read(chunk_size)
                spec_start = self.spec_open.search(data)

                if spec_start is None:
                    if len(data) != 0:
                        # no spectrum tag in this chunk: read the next one
                        continue

                    # Empty read, i.e. the jump went past the end of the
                    # file. Restart a linear search from a known spectrum
                    # that lies a little before the target.
                    sorted_keys = sorted(self.offset_dict.keys())
                    pos = bisect.bisect_left(
                        sorted_keys, target_index) - 2  # dat magic number :)
                    if pos < 0:
                        # Fewer than two known spectra precede the target.
                        # A negative position would silently wrap around to
                        # the END of the key list, so start at the top of
                        # the file instead.
                        spec_start_offset = 0
                    else:
                        key = sorted_keys[pos]
                        spec_start_offset = self.offset_dict[key][0]
                    seeker.seek(spec_start_offset)
                    return self._search_linear(seeker, target_index)

                spec_start_offset = file_pointer + spec_start.start()
                seeker.seek(spec_start_offset)
                current_index = int(
                    trailing_digits.search(spec_start.group('id')).group())

                self.offset_dict[current_index] = (spec_start_offset, )
                if current_index in used_indices:
                    # landed on the same spectrum again: nudge the next jump
                    if current_index > target_index:
                        jumper_scaling -= 0.1
                    else:
                        jumper_scaling += 0.1

                used_indices.add(current_index)

                dist = current_index - target_index
                if -fallback_cutoff < dist < 0:
                    # Target lies shortly after this spectrum. This includes
                    # the direct predecessor (dist == -1), which must not go
                    # through another interpolation jump.
                    return self._search_linear(seeker, target_index)

                if 0 < dist < fallback_cutoff:
                    # Target lies shortly before this spectrum: step back
                    # chunk by chunk until a spectrum at or before the
                    # target shows up, then search forward from there.
                    while current_index > target_index:
                        offset = int(current_position - chunk_size)
                        seeker.seek(offset if offset > 0 else 0)
                        current_position = seeker.tell()
                        data = seeker.read(chunk_size)
                        spec_start = self.spec_open.search(data)
                        if spec_start:
                            current_index = int(
                                trailing_digits.search(
                                    spec_start.group('id')).group())
                        if current_position == 0:
                            # start of file reached; stepping back further
                            # is impossible and would loop forever
                            break
                    seeker.seek(current_position)
                    return self._search_linear(seeker, target_index)

                if current_index == target_index:
                    seeker.seek(spec_start_offset)
                    start, end = self._read_to_spec_end(seeker)
                    seeker.seek(start)
                    self.offset_dict[current_index] = (start, end)
                    xml_string = seeker.read(end - start)
                    return spec.Spectrum(XML(xml_string),
                                         measured_precision=5e-6)

                if current_index == 0:
                    # the jump below divides by current_index; search
                    # forward from here instead of failing
                    return self._search_linear(seeker, target_index)

                # Still far away (in either direction): jump to a position
                # scaled by the ratio of wanted and found index.
                scaling = target_index / current_index
                seeker.seek(
                    int(current_position * scaling * jumper_scaling))
                current_position = seeker.tell()