Beispiel #1
0
 def _get_doc_by_raw_offset(self, doc_id):
     """
     Fetch the XML chunk of a single document via its stored bounds.

     The bounds are looked up in the metadata returned by
     ``self._get_meta()`` under ``str(doc_id)`` and passed to
     ``xml_utils.load_chunk`` together with ``self.filename``; here they
     are used as byte offsets into the file.

     XXX: this is not tested under Windows.
     """
     doc_meta = self._get_meta()[str(doc_id)]
     return xml_utils.load_chunk(self.filename, doc_meta.bounds)
Beispiel #2
0
 def _get_doc_by_line_offset(self, doc_id):
     """
     Fetch the XML chunk of a single document via line offsets.

     The document bounds are looked up in the metadata returned by
     ``self._get_meta()`` under ``str(doc_id)`` and handed to
     ``xml_utils.load_chunk`` with ``slow=True``. This path is much
     slower than _get_doc_by_raw_offset but should work everywhere.
     """
     doc_meta = self._get_meta()[str(doc_id)]
     return xml_utils.load_chunk(
         self.filename,
         doc_meta.bounds,
         slow=True,
     )
Beispiel #3
0
    def _compute_document_meta(self):
        """
        Build the per-document metadata index used for fast lookups.

        Every ``<text id=... name=...> ... </text>`` span reported by
        ``xml_utils.bounds`` for ``self.filename`` yields one entry. The
        result maps the document id (as a string) to a ``_DocumentMeta``
        holding the unescaped title, the span bounds in the file and the
        document categories.
        """
        index = compat.OrderedDict()
        spans = xml_utils.bounds(
            self.filename,
            r'<text id="(\d+)"[^>]*name="([^"]*)"',
            r'</text>',
        )
        for opening, span in spans:
            key = str(opening.group(1))
            name = xml_utils.unescape_attribute(opening.group(2))

            # The categories are only available from the parsed document,
            # so each chunk is loaded and parsed once here and the result
            # is stored alongside the title and bounds.
            chunk = xml_utils.load_chunk(self.filename, span)
            root = compat.ElementTree.XML(chunk.encode('utf8'))
            doc = Document(root)

            index[key] = _DocumentMeta(name, span, doc.categories())
        return index