def _search_string_identifier(self, search_string, chunk_size=8):
    """
    Scan the file front to back for a spectrum whose native id contains
    ``search_string`` and return it.

    Arguments:
        search_string (str): text that has to occur literally inside the
            ``id`` attribute of the ``<spectrum ...>`` start tag

    Keyword Arguments:
        chunk_size (int): number of 512-byte units read per iteration

    Returns:
        Spectrum (:py:class:`Spectrum`): spectrum built from the matching
        element

    Raises:
        Exception: if the end of the file is reached without a match
    """
    total_chunk_size = chunk_size * 512
    needle = search_string.encode()
    # NOTE: This needs to go into regex_patterns.py
    # The needle is escaped so that ids containing regex metacharacters
    # (e.g. "(" or ".") are matched literally instead of breaking the
    # pattern or producing false positives.
    regex_string = re.compile(
        rb'<\s*spectrum[^>]*index="[0-9]+"\sid="(.*'
        + re.escape(needle)
        + rb'.*)"\sdefaultArrayLength="[0-9]+">'
    )
    with open(self.path, 'rb') as seeker:
        while True:
            file_pointer = seeker.tell()
            data = seeker.read(total_chunk_size)
            # presumably extends the chunk so that a tag cut by the chunk
            # boundary is complete -- see _read_until_tag_end
            string, seeker = self._read_until_tag_end(seeker, byte_mode=True)
            data += string
            spec_start = regex_string.search(data)
            if spec_start:
                # translate the match position inside the chunk into an
                # absolute file offset
                spec_start_offset = file_pointer + spec_start.start()
                seeker.seek(spec_start_offset)
                start, end = self._read_to_spec_end(seeker)
                seeker.seek(start)
                spec_string = seeker.read(end - start)
                return spec.Spectrum(XML(spec_string), measured_precision=5e-6)
            if not data:
                # nothing left to read: the id is not in the file
                raise Exception('cant find specified string')
def __getitem__(self, identifier):
    """
    Access the item with id 'identifier' in the file by iterating the
    xml-tree from the start of the file.

    The current position of ``self.file_handler`` is restored before
    returning, also when the lookup fails.

    Arguments:
        identifier: for spectra, compared against the integer parsed from
            the text matched by ``regex_patterns.SPECTRUM_ID_PATTERN`` in
            the element's ``id`` attribute; for chromatograms, compared
            against the ``id`` attribute itself

    Returns:
        Spectrum or Chromatogram built from the first matching element

    Raises:
        StopIteration: if the whole document was parsed without a match
            (kept for backward compatibility)
    """
    old_pos = self.file_handler.tell()
    try:
        self.file_handler.seek(0, 0)
        mzml_iter = iter(iterparse(self.file_handler, events=['end']))
        while True:
            # next() raises StopIteration once the document is exhausted
            event, element = next(mzml_iter)
            if event != 'end':
                continue
            if element.tag.endswith('}spectrum'):
                id_match = regex_patterns.SPECTRUM_ID_PATTERN.search(
                    element.get('id'))
                # skip spectra whose id does not match the pattern instead
                # of failing with AttributeError on None
                if id_match is not None \
                        and int(id_match.group(0)) == identifier:
                    return spec.Spectrum(element, measured_precision=5e-6)
            elif element.tag.endswith('}chromatogram'):
                if element.get('id') == identifier:
                    return spec.Chromatogram(
                        element, measured_precision=5e-6)
    finally:
        # put the handle back where the caller left it on every exit path
        self.file_handler.seek(old_pos, 0)
def __next__(self):
    """
    Return the next spectrum from ``self.iter``.

    Elements other than spectra are skipped. If the run info contains a
    truthy 'referenceable_param_group_list' entry, the referenced
    parameters are applied to the spectrum before it is returned.

    Returns:
        Spectrum (:py:class:`Spectrum`): a spectrum object with interface
        to the original spectrum element.

    Raises:
        StopIteration: when ``self.iter`` is exhausted

    Example:

    >>> for spectrum in Reader:
    ...     print(spectrum.mz, end='\\r')

    """
    apply_ref_groups = self.info.get('referenceable_param_group_list', False)
    exhausted = ('END', 'END')
    while True:
        event, element = next(self.iter, exhausted)
        if event == 'END':
            # sentinel returned by next() once the iterator is empty
            raise StopIteration
        if event != 'end':
            continue
        if not element.tag.endswith('}spectrum'):
            continue
        spectrum = spec.Spectrum(element)
        if apply_ref_groups:
            spectrum._set_params_from_reference_group(
                self.info['referenceable_param_group_list_element']
            )
        level = spectrum.ms_level
        # precision is looked up per MS level
        spectrum.measured_precision = self.ms_precisions[level]
        spectrum.calling_instance = self
        return spectrum
def __next__(self):
    """
    Advance ``self.iter`` to the next spectrum element and wrap it.

    Returns:
        Spectrum (:py:class:`Spectrum`): a spectrum object with interface
        to the original spectrum element.

    Raises:
        StopIteration: when ``self.iter`` has no more items

    Example:

    >>> for spectrum in Reader:
    ...     print(spectrum.mz, end='\\r')

    """
    while True:
        event, element = next(self.iter, ('END', 'END'))
        closes_spectrum = event == 'end' and element.tag.endswith('}spectrum')
        if closes_spectrum:
            spectrum = spec.Spectrum(element)
            level = spectrum.ms_level
            # precision depends on the MS level of the spectrum
            spectrum.measured_precision = self.ms_precisions[level]
            spectrum.calling_instance = self
            return spectrum
        if event == 'END':
            # default value of next(): the underlying iterator is empty
            raise StopIteration
def _search_linear(self, seeker, index, chunk_size=8):
    """
    Fallback to linear search if interpolated search fails.

    Reads forward from the current position of ``seeker``, one spectrum
    after the other, until the spectrum whose id ends in ``index`` is
    found. The byte range of every spectrum passed on the way is stored in
    ``self.offset_dict``.

    Arguments:
        seeker: binary file object positioned where the scan should start;
            it is closed before this method returns or raises
        index (int): trailing number of the native id to look for

    Keyword Arguments:
        chunk_size (int): number of 512-byte units read per iteration

    Returns:
        Spectrum (:py:class:`Spectrum`): the spectrum with the given index

    Raises:
        KeyError: if the end of the file is reached before the spectrum
            (or the closing tag of a started spectrum) is found
    """
    total_chunk_size = chunk_size * 512
    try:
        while True:
            file_pointer = seeker.tell()
            data = seeker.read(total_chunk_size)
            string, seeker = self._read_until_tag_end(seeker)
            data += string
            if not data:
                # end of file: without this check the loop never ends
                raise KeyError(
                    'spectrum with index {0} not found'.format(index))
            spec_start = self.spec_open.search(data)
            if not spec_start:
                continue
            spec_start_offset = file_pointer + spec_start.start()
            seeker.seek(spec_start_offset)
            # the index is the run of digits at the end of the id
            current_index = int(
                re.search(b'[0-9]*$', spec_start.group('id')).group())

            spec_end = self.spec_close.search(data[spec_start.start():])
            if spec_end:
                # end position is relative to the slice that begins at the
                # start tag, hence the extra spec_start.start()
                spec_end_offset = (
                    file_pointer + spec_end.end() + spec_start.start())
                seeker.seek(spec_end_offset)
            while spec_end is None:
                # closing tag lies beyond the current chunk: keep reading
                # (the first pass re-reads from the start tag)
                file_pointer = seeker.tell()
                data = seeker.read(total_chunk_size)
                string, seeker = self._read_until_tag_end(seeker)
                data += string
                if not data:
                    raise KeyError(
                        'no closing tag found for spectrum with index '
                        '{0}'.format(current_index))
                spec_end = self.spec_close.search(data)
                if spec_end:
                    spec_end_offset = file_pointer + spec_end.end()
                    seeker.seek(spec_end_offset)

            # remember every spectrum whose bounds are now known
            self.offset_dict[current_index] = (spec_start_offset,
                                               spec_end_offset)
            if current_index == index:
                seeker.seek(spec_start_offset)
                spec_string = seeker.read(spec_end_offset - spec_start_offset)
                xml_string = XML(spec_string)
                return spec.Spectrum(xml_string, measured_precision=5e-6)
    finally:
        # the handle was always closed on success; do the same on failure
        seeker.close()
def __getitem__(self, identifier):
    """
    Access the item with id 'identifier'.

    Lookup strategy, in this order:

    1. 'TIC' (any case): parse the document and return the chromatogram
       whose id is 'TIC'.
    2. identifier already present in ``self.offset_dict``: jump to the
       stored offset and read the spectrum from there.
    3. string identifier: delegate to ``_search_string_identifier``.
    4. anything else: delegate to ``_interpol_search``.

    Arguments:
        identifier (str or int): native id of the item to access

    Returns:
        Spectrum or Chromatogram for the given identifier

    Raises:
        StopIteration: if 'TIC' is requested but the document contains no
            chromatogram with that id (kept for backward compatibility)
    """
    # NOTE: the index based search does not hold if the spectrum numbers
    # do not start with one and/or do not increase by one.
    # TODO FIXME
    self.file_handler.seek(0)
    if str(identifier).upper() == 'TIC':
        for event, element in iterparse(self.file_handler, events=['end']):
            if element.tag.endswith('}chromatogram') \
                    and element.get('id') == 'TIC':
                return spec.Chromatogram(element, measured_precision=5e-6)
        # NOTE(review): StopIteration is an unusual error for __getitem__;
        # it is what this method has always raised here.
        raise StopIteration
    if identifier in self.offset_dict:
        known_offsets = self.offset_dict[identifier]
        with open(self.path, 'rb') as seeker:
            seeker.seek(known_offsets[0])
            start, end = self._read_to_spec_end(seeker)
        # NOTE(review): the offsets come from a binary handle but the data
        # is read through self.file_handler -- confirm both agree on
        # positions for the files in use.
        self.file_handler.seek(start, 0)
        data = self.file_handler.read(end - start)
        return spec.Spectrum(XML(data), measured_precision=5e-6)
    # isinstance also accepts str subclasses
    if isinstance(identifier, str):
        return self._search_string_identifier(identifier)
    return self._interpol_search(identifier)
def __getitem__(self, key):
    """
    Execute a SQL request, process the data and return a spectrum object.

    Args:
        key (str or int): unique identifier for the given spectrum in the
            database

    Returns:
        Spectrum or Chromatogram, depending on the tag of the stored
        element

    Raises:
        KeyError: if no row with the given id exists
        ValueError: if the stored element is neither a spectrum nor a
            chromatogram
    """
    # The second argument of execute() is the sequence of bound values, so
    # a bare key has to be wrapped. An already wrapped key is passed on
    # unchanged to keep such calls working.
    params = key if isinstance(key, (tuple, list)) else (key,)
    self.cursor.execute("SELECT * FROM spectra WHERE id=?", params)
    row = self.cursor.fetchone()
    if row is None:
        raise KeyError(key)
    # rows are expected to have two columns: the id and the XML text
    _, xml_text = row
    element = et.XML(xml_text)
    if "spectrum" in element.tag:
        return spec.Spectrum(element)
    if "chromatogram" in element.tag:
        return spec.Chromatogram(element)
    raise ValueError(
        "unexpected element <{0}> stored for id {1!r}".format(
            element.tag, key))
def __getitem__(self, identifier):
    """
    Access the item with id 'identifier' in the file.

    The raw block returned by ``self.Reader.read_block`` is wrapped in an
    ``<mzML>`` root element that declares the mzML namespaces, parsed, and
    the first child of that root is handed to the matching wrapper class.

    Arguments:
        identifier (str): native id of the item to access

    Returns:
        Chromatogram if the tag of the first child contains
        'chromatogram', otherwise Spectrum
    """
    # TODO: more elegant way to add the namespace
    # (.register_namespace maybe?)
    ns_prefix = '<mzML xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="test_Creinhardtii_QE_pH8" version="1.1.0" xmlns="http://psi.hupo.org/ms/mzml">'
    ns_suffix = '</mzML>'
    raw_block = self.Reader.read_block(identifier)
    document = ns_prefix + raw_block.decode('utf-8') + ns_suffix
    root = XML(document)
    first_child = root[0]
    if 'chromatogram' in first_child.tag:
        wrapper = spec.Chromatogram
    else:
        wrapper = spec.Spectrum
    return wrapper(first_child, measured_precision=5e-6)
def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
    """
    Use linear interpolation search to find spectra faster.

    Starting in the middle of the file, a chunk is read and the first
    spectrum start tag in it is inspected. Depending on how far its index
    is from ``target_index`` the search either jumps to a position scaled
    by ``target_index / current_index`` or, once it is closer than
    ``fallback_cutoff``, hands over to ``_search_linear``.

    Arguments:
        target_index (int): trailing number of the native id to access

    Keyword Arguments:
        chunk_size (int): number of 512-byte units read in one go
        fallback_cutoff (int): index distance below which the linear
            search takes over

    Returns:
        Spectrum (:py:class:`Spectrum`): the spectrum with the given index

    Raises:
        IndexError: if the end of the file is reached and no spectrum
            offset is known to restart from
    """
    read_size = chunk_size * 512
    seeker = open(self.path, 'rb')
    try:
        seeker.seek(0, 2)
        file_size = seeker.tell()
        seeker.seek(file_size // 2, 0)
        current_position = seeker.tell()
        used_indices = set()
        while True:
            jumper_scaling = 1
            file_pointer = seeker.tell()
            data = seeker.read(read_size)
            spec_start = self.spec_open.search(data)
            if spec_start is not None:
                spec_start_offset = file_pointer + spec_start.start()
                seeker.seek(spec_start_offset)
                # the index is the run of digits at the end of the id
                current_index = int(
                    re.search(b'[0-9]*$', spec_start.group('id')).group())
                self.offset_dict[current_index] = (spec_start_offset, )
                if current_index in used_indices:
                    # landed on a spectrum seen before: nudge the next
                    # jump so the search does not repeat itself
                    if current_index > target_index:
                        jumper_scaling -= 0.1
                    else:
                        jumper_scaling += 0.1
                used_indices.add(current_index)

                dist = current_index - target_index
                if -fallback_cutoff < dist < -1:
                    # target is a little further ahead: scan forward
                    return self._search_linear(seeker, target_index)
                if 0 < dist < fallback_cutoff:
                    # target is a little way back: step backwards chunk by
                    # chunk until a spectrum at or before it shows up
                    while current_index > target_index:
                        offset = int(current_position - read_size)
                        seeker.seek(offset if offset > 0 else 0)
                        current_position = seeker.tell()
                        data = seeker.read(read_size)
                        earlier_start = self.spec_open.search(data)
                        if earlier_start:
                            current_index = int(
                                re.search(b'[0-9]*$',
                                          earlier_start.group('id')).group())
                        if current_position == 0:
                            # already at the start of the file; stepping
                            # back again would read the same bytes forever
                            break
                    seeker.seek(current_position)
                    return self._search_linear(seeker, target_index)

                if current_index == target_index:
                    seeker.seek(spec_start_offset)
                    start, end = self._read_to_spec_end(seeker)
                    seeker.seek(start)
                    self.offset_dict[current_index] = (start, end)
                    xml_string = seeker.read(end - start)
                    return spec.Spectrum(XML(xml_string),
                                         measured_precision=5e-6)

                # interpolate the next position from the index ratio
                # NOTE(review): raises ZeroDivisionError if current_index
                # is 0 -- confirm whether ids can start at zero.
                scaling = target_index / current_index
                seeker.seek(
                    int(current_position * scaling * jumper_scaling))
                current_position = seeker.tell()
            elif not data:
                # Ran past the end of the file: restart the linear scan
                # from a known spectrum in front of the target. Two
                # entries before the insertion point, but never a negative
                # position, which would wrap to the largest known index.
                sorted_keys = sorted(self.offset_dict)
                pos = max(
                    bisect.bisect_left(sorted_keys, target_index) - 2, 0)
                key = sorted_keys[pos]
                seeker.seek(self.offset_dict[key][0])
                return self._search_linear(seeker, target_index)
    finally:
        # single place where the handle is released, on every exit path
        seeker.close()