def _get_doc_by_raw_offset(self, doc_id):
    """
    Load a document's XML chunk using its stored bytes offsets.

    The meta entry is looked up under ``str(doc_id)`` and its ``bounds``
    are passed to ``xml_utils.load_chunk`` together with ``self.filename``.

    XXX: this is not tested under Windows.
    """
    entry = self._get_meta()[str(doc_id)]
    chunk_bounds = entry.bounds
    return xml_utils.load_chunk(self.filename, chunk_bounds)
def _get_doc_by_line_offset(self, doc_id):
    """
    Load a document's XML chunk using line offset information.

    Much slower than ``_get_doc_by_raw_offset``, but should work
    everywhere. The meta entry is looked up under ``str(doc_id)`` and
    its ``bounds`` are passed to ``xml_utils.load_chunk`` with
    ``slow=True``.
    """
    entry = self._get_meta()[str(doc_id)]
    chunk_bounds = entry.bounds
    return xml_utils.load_chunk(self.filename, chunk_bounds, slow=True)
def _compute_document_meta(self):
    """
    Build the meta information used for fast document lookups.

    The file is scanned for ``<text ...>`` / ``</text>`` pairs. For each
    one found, the result maps the document id (as a string) to a
    ``_DocumentMeta`` holding the unescaped title, the bounds reported by
    ``xml_utils.bounds`` and the document's categories.

    Returns an ordered mapping (``compat.OrderedDict``) filled in the
    order the documents are yielded by ``xml_utils.bounds``.
    """
    opening_re = r'<text id="(\d+)"[^>]*name="([^"]*)"'
    closing_re = r'</text>'

    result = compat.OrderedDict()
    for found, span in xml_utils.bounds(self.filename, opening_re, closing_re):
        doc_key = str(found.group(1))
        doc_title = xml_utils.unescape_attribute(found.group(2))

        # Each chunk is parsed here once so that the categories can be
        # cached in the meta entry.
        chunk = xml_utils.load_chunk(self.filename, span)
        root = compat.ElementTree.XML(chunk.encode('utf8'))
        parsed = Document(root)

        result[doc_key] = _DocumentMeta(doc_title, span, parsed.categories())

    return result