def test_should_find_lines_of_page_with_blocks(self):
    """get_lines_of_page yields exactly the lines placed in the requested page's block."""
    expected_lines = [E.TEXT(), E.TEXT()]
    page = E.PAGE(E.BLOCK(*expected_lines))
    # a second page is added just for effect; it is not the page being queried
    another_page = E.PAGE(E.BLOCK(E.TEXT()))
    root = E.DOCUMENT(page, another_page)
    structured_document = LxmlStructuredDocument(root)
    found_lines = list(structured_document.get_lines_of_page(page))
    assert found_lines == expected_lines
def test_should_not_fail_setting_empty_tag_to_none(self):
    """Setting a tag to None on an untagged token succeeds, with and without a scope."""
    text_node = E.TEXT()
    root = E.DOCUMENT(E.PAGE(E.BLOCK(text_node)))
    structured_document = LxmlStructuredDocument(root)

    # neither call is preceded by a set_tag with an actual tag value
    structured_document.set_tag(text_node, None)
    structured_document.set_tag(text_node, None, scope=SCOPE_1)

    unscoped_tag = structured_document.get_tag(text_node)
    assert unscoped_tag is None
    scoped_tag = structured_document.get_tag(text_node, scope=SCOPE_1)
    assert scoped_tag is None
def test_should_return_all_tag_by_scope(self):
    """get_tag_by_scope returns the unscoped tag under None and the scoped tag under its scope."""
    text_node = E.TEXT()
    root = E.DOCUMENT(E.PAGE(E.BLOCK(text_node)))
    structured_document = LxmlStructuredDocument(root)

    structured_document.set_tag(text_node, TAG_1)
    structured_document.set_tag(text_node, TAG_2, scope=SCOPE_1)

    unscoped_tag = structured_document.get_tag(text_node)
    assert unscoped_tag == TAG_1
    scoped_tag = structured_document.get_tag(text_node, scope=SCOPE_1)
    assert scoped_tag == TAG_2
    tags_by_scope = structured_document.get_tag_by_scope(text_node)
    assert tags_by_scope == {None: TAG_1, SCOPE_1: TAG_2}
def test_should_keep_text_block_structure_with_block(self):
    """The SVG text element of a single-token page sits under two SVG_G levels below SVG_DOC."""
    # build the LXML input inside-out: token -> text -> block -> page -> document
    token_attribs = dict_extend(COMMON_LXML_TOKEN_ATTRIBS, {LXML.BASE: SOME_BASE})
    token = E.TOKEN(SOME_TEXT, token_attribs)
    lxml_root = E.DOCUMENT(E.PAGE(E.BLOCK(E.TEXT(token))))

    svg_pages = list(iter_svg_pages_for_lxml(lxml_root))
    assert len(svg_pages) == 1

    svg_text = svg_pages[0].find('.//' + SVG_TEXT)
    assert svg_text is not None

    # walk up the ancestors one level at a time
    parent = svg_text.getparent()
    assert parent.tag == SVG_G
    grandparent = parent.getparent()
    assert grandparent.tag == SVG_G
    great_grandparent = grandparent.getparent()
    assert great_grandparent.tag == SVG_DOC
def test_should_set_tag_with_level(self):
    """A tag set with level=2 is returned for level=2 while the default lookup stays None."""
    text_node = E.TEXT()
    root = E.DOCUMENT(E.PAGE(E.BLOCK(text_node)))
    structured_document = LxmlStructuredDocument(root)

    structured_document.set_tag(text_node, TAG_1, level=2)

    level_tag = structured_document.get_tag(text_node, level=2)
    assert level_tag == TAG_1
    default_tag = structured_document.get_tag(text_node)
    assert default_tag is None
def test_should_set_tag_with_scope(self):
    """A tag set under SCOPE_1 is returned for that scope while the unscoped lookup stays None."""
    text_node = E.TEXT()
    root = E.DOCUMENT(E.PAGE(E.BLOCK(text_node)))
    structured_document = LxmlStructuredDocument(root)

    structured_document.set_tag(text_node, TAG_1, scope=SCOPE_1)

    scoped_tag = structured_document.get_tag(text_node, scope=SCOPE_1)
    assert scoped_tag == TAG_1
    unscoped_tag = structured_document.get_tag(text_node)
    assert unscoped_tag is None