Beispiel #1
0
 def test_should_find_lines_of_page_with_blocks(self):
     """get_lines_of_page yields exactly the TEXT lines of the given page."""
     first_line = E.TEXT()
     second_line = E.TEXT()
     target_page = E.PAGE(E.BLOCK(first_line, second_line))
     # a second page is present so the lookup has something to exclude
     other_page = E.PAGE(E.BLOCK(E.TEXT()))
     structured_document = LxmlStructuredDocument(
         E.DOCUMENT(target_page, other_page))
     found_lines = list(structured_document.get_lines_of_page(target_page))
     assert found_lines == [first_line, second_line]
Beispiel #2
0
 def test_should_not_fail_setting_empty_tag_to_none(self):
     """Setting None on a never-tagged token must not raise.

     Checked both without a scope and with SCOPE_1; the tag reads back
     as None in either case.
     """
     untagged_token = E.TEXT()
     root = E.DOCUMENT(E.PAGE(E.BLOCK(untagged_token)))
     structured_document = LxmlStructuredDocument(root)

     structured_document.set_tag(untagged_token, None)
     structured_document.set_tag(untagged_token, None, scope=SCOPE_1)

     default_scope_tag = structured_document.get_tag(untagged_token)
     assert default_scope_tag is None
     scoped_tag = structured_document.get_tag(untagged_token, scope=SCOPE_1)
     assert scoped_tag is None
Beispiel #3
0
 def test_should_return_all_tag_by_scope(self):
     """get_tag_by_scope maps each scope (None for unscoped) to its tag."""
     tagged_token = E.TEXT()
     root = E.DOCUMENT(E.PAGE(E.BLOCK(tagged_token)))
     structured_document = LxmlStructuredDocument(root)

     structured_document.set_tag(tagged_token, TAG_1)
     structured_document.set_tag(tagged_token, TAG_2, scope=SCOPE_1)

     # each tag is readable individually ...
     assert structured_document.get_tag(tagged_token) == TAG_1
     assert structured_document.get_tag(tagged_token, scope=SCOPE_1) == TAG_2
     # ... and together, keyed by scope
     expected_tags_by_scope = {None: TAG_1, SCOPE_1: TAG_2}
     actual_tags_by_scope = structured_document.get_tag_by_scope(tagged_token)
     assert actual_tags_by_scope == expected_tags_by_scope
 def test_should_keep_text_block_structure_with_block(self):
     """The SVG text element sits two SVG_G levels below the SVG_DOC root.

     Input is a single page holding one block, one text and one token.
     """
     token_attribs = dict_extend(
         COMMON_LXML_TOKEN_ATTRIBS, {LXML.BASE: SOME_BASE})
     token = E.TOKEN(SOME_TEXT, token_attribs)
     lxml_root = E.DOCUMENT(E.PAGE(E.BLOCK(E.TEXT(token))))

     svg_pages = list(iter_svg_pages_for_lxml(lxml_root))
     page_count = len(svg_pages)
     assert page_count == 1

     text_xpath = './/' + SVG_TEXT
     svg_text = svg_pages[0].find(text_xpath)
     assert svg_text is not None

     # walk up the ancestors one level at a time: G -> G -> DOC
     parent = svg_text.getparent()
     assert parent.tag == SVG_G
     grandparent = parent.getparent()
     assert grandparent.tag == SVG_G
     assert grandparent.getparent().tag == SVG_DOC
Beispiel #5
0
 def test_should_set_tag_with_level(self):
     """A tag set with level=2 is read back at level=2 only.

     Reading without a level still returns None.
     """
     leveled_token = E.TEXT()
     root = E.DOCUMENT(E.PAGE(E.BLOCK(leveled_token)))
     structured_document = LxmlStructuredDocument(root)

     structured_document.set_tag(leveled_token, TAG_1, level=2)

     tag_at_level = structured_document.get_tag(leveled_token, level=2)
     assert tag_at_level == TAG_1
     tag_without_level = structured_document.get_tag(leveled_token)
     assert tag_without_level is None
Beispiel #6
0
 def test_should_set_tag_with_scope(self):
     """A tag set under SCOPE_1 is read back under SCOPE_1 only.

     Reading without a scope still returns None.
     """
     scoped_token = E.TEXT()
     root = E.DOCUMENT(E.PAGE(E.BLOCK(scoped_token)))
     structured_document = LxmlStructuredDocument(root)

     structured_document.set_tag(scoped_token, TAG_1, scope=SCOPE_1)

     tag_in_scope = structured_document.get_tag(scoped_token, scope=SCOPE_1)
     assert tag_in_scope == TAG_1
     tag_without_scope = structured_document.get_tag(scoped_token)
     assert tag_without_scope is None