def load_lxml_structured_document(filename, page_range=None):
    """Parse the XML file at *filename* into an ``LxmlStructuredDocument``.

    Args:
        filename: Location handed to ``FileSystems.open``.
        page_range: Optional indexable pair ``(first, last)`` of 1-based,
            inclusive page numbers. When falsy, the whole document is
            returned. A ``first`` below 1 is clamped to the first page.

    Returns:
        The document wrapping the parsed root, or, when *page_range* is
        given, a new document whose ``DOCUMENT`` root contains only the
        selected pages.
    """
    with FileSystems.open(filename) as lxml_file:
        parsed_root = etree.parse(lxml_file).getroot()
        document = LxmlStructuredDocument(parsed_root)
        if not page_range:
            return document

        # NOTE(review): assumes get_pages() returns a sliceable sequence.
        all_pages = document.get_pages()
        # 1-based inclusive page numbers -> 0-based half-open slice bounds.
        start_index = max(0, page_range[0] - 1)
        end_index = page_range[1]
        selected_pages = all_pages[start_index:end_index]
        return LxmlStructuredDocument(E.DOCUMENT(*selected_pages))
def test_should_find_pages(self):
    """get_pages() returns the PAGE elements the DOCUMENT was built from."""
    expected_pages = [E.PAGE(), E.PAGE()]
    document_root = E.DOCUMENT(*expected_pages)
    structured_document = LxmlStructuredDocument(document_root)

    found_pages = list(structured_document.get_pages())

    assert found_pages == expected_pages