Ejemplo n.º 1
0
 def test_should_return_one_page(self):
     """A document holding a single empty PAGE yields exactly one SVG page."""
     document = E.DOCUMENT(E.PAGE())
     pages = list(iter_svg_pages_for_lxml(document))
     assert len(pages) == 1
 def test_should_add_background_rect(self):
     """The SVG page has exactly one direct child rect with class "background"."""
     document = E.DOCUMENT(E.PAGE(width='600', height='800'))
     pages = list(iter_svg_pages_for_lxml(document))
     assert len(pages) == 1
     # relative xpath: only matches rect elements directly below the page root
     matching_rects = pages[0].xpath(
         'svg:rect[@class="background"]',
         namespaces={'svg': SVG_NS}
     )
     assert len(matching_rects) == 1
Ejemplo n.º 3
0
 def test_should_create_text_node_with_common_attributes(self):
     """A TOKEN becomes an SVG text node carrying text, position, font and bounding box."""
     token = E.TOKEN(SOME_TEXT, COMMON_LXML_TOKEN_ATTRIBS)
     document = E.DOCUMENT(E.PAGE(E.TEXT(token)))
     pages = list(iter_svg_pages_for_lxml(document))
     assert len(pages) == 1

     svg_text = pages[0].find('.//' + SVG_TEXT)
     assert svg_text is not None
     assert svg_text.text == SOME_TEXT

     # numeric attributes are compared as floats so that formatting differences do not matter
     attrib = svg_text.attrib
     assert float(attrib[SVG.X]) == float(SOME_X)
     assert float(attrib[SVG.Y]) == float(SOME_Y)
     assert float(attrib[SVG.FONT_SIZE]) == float(SOME_FONT_SIZE)
     assert attrib[SVG.FONT_FAMILY] == SOME_FONT_FAMILY
     assert attrib[SVG.FILL] == SOME_FONT_COLOR

     # the bounding box is expected to mirror the x, y, width and height of the source token
     actual_bounding_box = parse_bounding_box(attrib.get(SVG.BOUNDING_BOX))
     bounding_box_keys = (LXML.X, LXML.Y, LXML.WIDTH, LXML.HEIGHT)
     expected_bounding_box = BoundingBox(*[
         float(COMMON_LXML_TOKEN_ATTRIBS[key])
         for key in bounding_box_keys
     ])
     assert actual_bounding_box == expected_bounding_box
Ejemplo n.º 4
0
 def test_should_set_svg_dimensions(self):
     """The PAGE width and height end up in the viewBox of the SVG page."""
     page = E.PAGE(width='600', height='800')
     pages = list(iter_svg_pages_for_lxml(E.DOCUMENT(page)))
     assert len(pages) == 1
     view_box = pages[0].attrib.get('viewBox')
     assert view_box == '0 0 600 800'
 def test_should_use_base_as_y_in_svg_if_available(self):
     """When the TOKEN carries a base attribute, the SVG text node uses it as its y value."""
     token_attribs = dict_extend(COMMON_LXML_TOKEN_ATTRIBS, {LXML.BASE: SOME_BASE})
     lxml_root = E.DOCUMENT(E.PAGE(E.TEXT(E.TOKEN(SOME_TEXT, token_attribs))))
     svg_pages = list(iter_svg_pages_for_lxml(lxml_root))
     assert len(svg_pages) == 1
     first_page = svg_pages[0]
     svg_text = first_page.find('.//' + SVG_TEXT)
     # find() returns None when nothing matches; fail with a clear assertion
     # (like the sibling tests) rather than an AttributeError on the next line
     assert svg_text is not None
     assert float(svg_text.attrib[SVG.Y]) == float(SOME_BASE)
 def test_should_keep_text_block_structure_without_block(self):
     """The SVG text node sits inside a group that is a direct child of the SVG document."""
     token_attribs = dict_extend(COMMON_LXML_TOKEN_ATTRIBS, {LXML.BASE: SOME_BASE})
     document = E.DOCUMENT(E.PAGE(E.TEXT(E.TOKEN(SOME_TEXT, token_attribs))))
     pages = list(iter_svg_pages_for_lxml(document))
     assert len(pages) == 1

     svg_text = pages[0].find('.//' + SVG_TEXT)
     assert svg_text is not None

     # expected nesting: svg document > g > text
     group = svg_text.getparent()
     assert group.tag == SVG_G
     assert group.getparent().tag == SVG_DOC
Ejemplo n.º 7
0
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert LXML content to annotated SVG pages with annotation visualisation.

    Steps, each timed by a ``StopWatchRecorder``:

    1. parse ``lxml_content`` and (leniently) ``xml_content``
    2. extract the target annotations from the XML using ``xml_mapping``
    3. convert the LXML document to SVG pages
    4. annotate the SVG pages using the default annotators plus a
       ``MatchingAnnotator`` for the target annotations
    5. pass every SVG page through ``visualize_svg_annotations``

    Args:
        lxml_content: serialized LXML document, parsed via ``etree.fromstring``.
        xml_content: serialized XML document the target annotations are taken from.
        xml_mapping: mapping passed on to ``xml_root_to_target_annotations``.
        name: optional label, only used in the summary log message.

    Returns:
        A list with the result of ``visualize_svg_annotations`` for every SVG page.
    """
    timer = StopWatchRecorder()

    timer.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # parse the xml leniently, as xml errors are not uncommon
    timer.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    timer.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
    timer.stop()

    # setting up the annotators is deliberately not part of any timed section
    annotator = Annotator(DEFAULT_ANNOTATORS + [
        MatchingAnnotator(target_annotations, use_tag_begin_prefix=True)
    ])

    timer.start('convert to svg')
    svg_pages = list(iter_svg_pages_for_lxml(lxml_root))

    timer.start('annotate svg')
    structured_document = SvgStructuredDocument(svg_pages)
    annotator.annotate(structured_document)

    timer.start('add visualisation')
    visualized_pages = []
    for svg_page in svg_pages:
        visualized_pages.append(visualize_svg_annotations(svg_page))
    timer.stop()

    lxml_size = format(len(lxml_content), ',')
    xml_size = format(len(xml_content), ',')
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, lxml_size, xml_size, timer, align_native_enabled
    )

    return visualized_pages