Beispiel #1
0
 def test_should_use_viewbox_if_available(self):
     """The page bounding box should match the box stored as viewbox."""
     expected_box = BoundingBox(11, 12, 101, 102)
     page_attrib = {SVG_VIEWBOX_ATTRIB: format_bounding_box(expected_box)}
     page = E.svg(page_attrib)
     doc = SvgStructuredDocument(page)
     actual_box = doc.get_bounding_box(page)
     assert actual_box == expected_box
Beispiel #2
0
 def test_should_not_return_bounding_box_if_font_size_is_missing(self):
     """A text node with only ``x`` and ``y`` should yield no bounding box."""
     position_only_attrib = {'x': '10', 'y': '11'}
     text = SVG_TEXT(position_only_attrib)
     root = E.svg(SVG_TEXT_LINE(text))
     doc = SvgStructuredDocument(root)
     assert doc.get_bounding_box(text) is None
 def test_should_be_able_to_set_bounding_box(self):
     """Setting a bounding box should store its formatted form on the node."""
     new_box = BoundingBox(11, 12, 101, 102)
     text_attrib = {'x': '10', 'y': '11', 'font-size': '100'}
     text = SVG_TEXT('a', text_attrib)
     root = E.svg(SVG_TEXT_LINE(text))
     doc = SvgStructuredDocument(root)
     doc.set_bounding_box(text, new_box)
     stored_value = text.attrib[SVGE_BOUNDING_BOX]
     assert stored_value == format_bounding_box(new_box)
Beispiel #4
0
 def test_should_calculate_default_bounding_box(self):
     """Without a stored box, one is derived from x, y and font-size."""
     text_attrib = {'x': '10', 'y': '11', 'font-size': '100'}
     text = SVG_TEXT('a', text_attrib)
     root = E.svg(SVG_TEXT_LINE(text))
     doc = SvgStructuredDocument(root)
     # single character: expected width is 0.8 times the font size
     expected_box = BoundingBox(10, 11, 100 * 0.8, 100)
     assert doc.get_bounding_box(text) == expected_box
Beispiel #5
0
 def test_should_tag_text_as_line_no(self):
     """Tagging with ``LINE_NO`` should be reflected in the class attribute."""
     text = SVG_TEXT()
     root = E.svg(SVG_TEXT_LINE(text))
     doc = SvgStructuredDocument(root)
     doc.set_tag(text, SvgStyleClasses.LINE_NO)
     class_value = text.attrib['class']
     assert class_value == SvgStyleClasses.LINE_NO
Beispiel #6
0
 def test_should_use_bounding_box_if_available(self):
     """A bounding box stored on the node should win over x/y/font-size."""
     stored_box = BoundingBox(11, 12, 101, 102)
     text_attrib = {'x': '10', 'y': '11', 'font-size': '100'}
     text_attrib[SVGE_BOUNDING_BOX] = format_bounding_box(stored_box)
     text = SVG_TEXT('a', text_attrib)
     root = E.svg(SVG_TEXT_LINE(text))
     doc = SvgStructuredDocument(root)
     assert doc.get_bounding_box(text) == stored_box
Beispiel #7
0
def convert(args):
    """Convert an LXML file to one SVG file per page, optionally annotating.

    Attributes read from ``args``:

    - ``lxml_path``: input file, parsed with ``etree.parse``.
    - ``svg_path``: output filename pattern, formatted with the 1-based page
      number; derived from ``lxml_path`` when empty.
    - ``annotate``: when truthy, the pages are annotated, visualised and
      evaluated.
    - ``debug_match``: optional CSV output for match details.
    - ``xml_path`` / ``xml_mapping_path``: optional target XML and mapping,
      enabling the matching annotator.
    - ``annotation_evaluation_csv``: optional CSV output for the per-page
      evaluation.

    The match detail reporter, if one was created, is closed even when a
    later step raises.
    """
    logger = get_logger()
    svg_filename_pattern = args.svg_path
    if not svg_filename_pattern:
        svg_filename_pattern = svg_pattern_for_lxml_path(args.lxml_path)
    logger.debug('svg_filename_pattern: %s', svg_filename_pattern)
    lxml_root = etree.parse(args.lxml_path).getroot()

    match_detail_reporter = None
    try:
        annotator = None
        if args.annotate:
            annotators = DEFAULT_ANNOTATORS
            if args.debug_match:
                match_detail_reporter = CsvMatchDetailReporter(
                    open_csv_output(args.debug_match), args.debug_match)
            if args.xml_path:
                xml_mapping = parse_xml_mapping(args.xml_mapping_path)
                target_annotations = xml_root_to_target_annotations(
                    etree.parse(args.xml_path).getroot(), xml_mapping)
                # build a new list rather than mutating DEFAULT_ANNOTATORS
                annotators = annotators + [
                    MatchingAnnotator(
                        target_annotations,
                        match_detail_reporter=match_detail_reporter,
                        use_tag_begin_prefix=True)
                ]
            annotator = Annotator(annotators)

        if annotator:
            # all pages are needed up-front to annotate the whole document
            svg_roots = list(iter_svg_pages_for_lxml(lxml_root))
            annotator.annotate(SvgStructuredDocument(svg_roots))
        else:
            # no annotation: pages can be streamed straight to disk
            svg_roots = iter_svg_pages_for_lxml(lxml_root)

        for page_index, svg_root in enumerate(svg_roots):
            if annotator:
                svg_root = visualize_svg_annotations(svg_root)
            svg_filename = svg_filename_pattern.format(1 + page_index)
            logger.info('writing to: %s', svg_filename)
            with open(svg_filename, 'wb') as f:
                etree.ElementTree(svg_root).write(f, pretty_print=True)

        if annotator:
            tagging_evaluation_results = evaluate_document_by_page(
                SvgStructuredDocument(svg_roots))
            logger.info(
                'tagging evaluation:\n%s', '\n'.join([
                    'page{}: {}'.format(1 + i, r)
                    for i, r in enumerate(tagging_evaluation_results)
                ]))
            if args.annotation_evaluation_csv:
                write_dict_csv(
                    args.annotation_evaluation_csv,
                    DEFAULT_EVALUATION_COLUMNS,
                    to_annotation_evaluation_csv_dict_rows(
                        tagging_evaluation_results,
                        document=os.path.basename(args.lxml_path)))
    finally:
        # previously only closed on success, leaking the CSV output on errors
        if match_detail_reporter:
            match_detail_reporter.close()
Beispiel #8
0
 def test_should_estimate_width_based_on_number_of_characters(self):
     """The estimated width should scale with the number of characters."""
     content = 'abc'
     text_attrib = {'x': '10', 'y': '11', 'font-size': '100'}
     text = SVG_TEXT(content, text_attrib)
     root = E.svg(SVG_TEXT_LINE(text))
     doc = SvgStructuredDocument(root)
     expected_width = 100 * 0.8 * len(content)
     expected_box = BoundingBox(10, 11, expected_width, 100)
     assert doc.get_bounding_box(text) == expected_box
Beispiel #9
0
 def test_should_return_all_tag_by_scope(self):
     """Default-scope and scoped tags should both be reported per scope."""
     token = SVG_TEXT('test')
     root = E.svg(SVG_TEXT_LINE(token))
     doc = SvgStructuredDocument(root)
     doc.set_tag(token, TAG_1)
     doc.set_tag(token, TAG_2, scope=SCOPE_1)
     assert doc.get_tag(token) == TAG_1
     assert doc.get_tag(token, scope=SCOPE_1) == TAG_2
     # the default scope is keyed by None
     expected_tag_by_scope = {None: TAG_1, SCOPE_1: TAG_2}
     assert doc.get_tag_by_scope(token) == expected_tag_by_scope
Beispiel #10
0
 def test_should_find_lines_of_page_without_blocks(self):
     """Lines placed directly below the svg root should be found."""
     lines = [SVG_TEXT_LINE() for _ in range(2)]
     root = E.svg(*lines)
     doc = SvgStructuredDocument(root)
     first_page = doc.get_pages()[0]
     found_lines = list(doc.get_lines_of_page(first_page))
     assert found_lines == lines
Beispiel #11
0
 def test_should_find_tokens_of_line(self):
     """Only the tokens of the requested line should be returned."""
     tokens = [SVG_TEXT() for _ in range(2)]
     line = SVG_TEXT_LINE(*tokens)
     # a second line whose token must not show up in the result
     other_line = SVG_TEXT_LINE(SVG_TEXT())
     root = E.svg(line, other_line)
     doc = SvgStructuredDocument(root)
     found_tokens = list(doc.get_tokens_of_line(line))
     assert found_tokens == tokens
Beispiel #12
0
 def test_should_not_fail_setting_empty_tag_to_none(self):
     """Clearing a tag that was never set should work, with or without scope."""
     token = SVG_TEXT('test')
     root = E.svg(SVG_TEXT_LINE(token))
     doc = SvgStructuredDocument(root)
     doc.set_tag(token, None)
     doc.set_tag(token, None, scope=SCOPE_1)
     default_scope_tag = doc.get_tag(token)
     assert default_scope_tag is None
     scoped_tag = doc.get_tag(token, scope=SCOPE_1)
     assert scoped_tag is None
Beispiel #13
0
 def test_should_call_save_svg_structured_document(self):
     """Saving an svg document should delegate to the svg-specific saver."""
     # Build a real element: the bare ``E.svg`` factory was previously
     # passed as the page root, unlike the other tests using ``E.svg()``.
     structured_document = SvgStructuredDocument(E.svg())
     m = structured_document_saver
     with patch.object(m, 'save_svg_structured_document'
                       ) as save_svg_structured_document_mock:
         save_structured_document(FILE_1, structured_document)
         save_svg_structured_document_mock.assert_called_with(
             FILE_1, structured_document)
 def test_should_call_save_pages(self):
     """Saving an svg document should pass the serialised pages to save_pages."""
     saver_module = structured_document_saver
     svg_root = E.svg()
     with patch.object(saver_module, 'save_pages') as save_pages_mock, \
             patch.object(saver_module, 'etree') as etree_mock:
         document = SvgStructuredDocument(svg_root)
         save_svg_structured_document(FILE_1, document)
         save_pages_mock.assert_called_with(FILE_1, '.svg', ANY)
         positional_args = save_pages_mock.call_args[0]
         # consume the pages while etree is still patched
         saved_pages = list(positional_args[2])
         assert saved_pages == [etree_mock.tostring(svg_root)]
Beispiel #15
0
def svg_page_to_blockified_png_bytes(svg_page, color_map, image_size=None):
    """Render the annotation blocks of a single svg page as PNG bytes.

    Args:
        svg_page: svg root element of the page; must carry a ``viewBox``
            attribute, whose last two values are used as width and height.
        color_map: passed through to ``annotated_blocks_to_image``.
        image_size: optional size passed as ``scale_to_size``.

    Returns:
        The PNG encoded image as bytes.

    Raises:
        RuntimeError: if the svg has no (or an empty) ``viewBox`` attribute.
        ValueError: if the ``viewBox`` does not consist of four numbers.
    """
    structured_document = SvgStructuredDocument(svg_page)
    blocks = expand_blocks(
        merge_blocks(
            annotation_document_page_to_annotation_blocks(
                structured_document,
                structured_document.get_pages()[0]
            )
        )
    )
    viewbox = svg_page.attrib.get('viewBox')
    if not viewbox:
        raise RuntimeError(
            'viewbox missing on svg, available attributes: %s' % svg_page.attrib.keys()
        )
    # SVG allows the viewBox numbers to be separated by whitespace and/or
    # commas, so normalise commas before splitting.
    _, _, width, height = viewbox.replace(',', ' ').split()
    image = annotated_blocks_to_image(
        blocks, color_map,
        width=float(width), height=float(height), background='white',
        scale_to_size=image_size
    )
    out = BytesIO()
    image.save(out, 'png')
    return out.getvalue()
Beispiel #16
0
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert LXML content to annotated svg pages with visualisation.

    The target annotations are extracted from ``xml_content`` using
    ``xml_mapping`` and matched against the svg pages generated from
    ``lxml_content``. ``name`` is only used for the summary log line.

    Returns:
        A list with one visualised svg root per page.
    """
    timings = StopWatchRecorder()

    timings.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # xml errors are not uncommon, so parse the xml with the recovering parser
    timings.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    timings.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
    timings.stop()

    matching_annotator = MatchingAnnotator(
        target_annotations, use_tag_begin_prefix=True
    )
    annotator = Annotator(DEFAULT_ANNOTATORS + [matching_annotator])

    timings.start('convert to svg')
    svg_roots = list(iter_svg_pages_for_lxml(lxml_root))

    timings.start('annotate svg')
    document = SvgStructuredDocument(svg_roots)
    annotator.annotate(document)

    timings.start('add visualisation')
    visualized_svg_roots = []
    for svg_root in svg_roots:
        visualized_svg_roots.append(visualize_svg_annotations(svg_root))
    timings.stop()

    lxml_size = format(len(lxml_content), ',')
    xml_size = format(len(xml_content), ',')
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, lxml_size, xml_size, timings, align_native_enabled
    )

    return visualized_svg_roots
def configure_pipeline(p, opt):
    """Attach the conversion, annotation and output steps to pipeline ``p``.

    The steps are assembled from ``beam`` transforms depending on ``opt``:

    - input: either LXML/XML file pairs (``opt.lxml_path``) or PDF/XML file
      pairs (``opt.pdf_path`` or ``opt.pdf_xml_file_list``); PDFs are
      converted to LXML first.
    - optional outputs: PNG pages (``opt.save_png``), LXML
      (``opt.save_lxml``), annotated SVG pages (``opt.save_svg``), block PNG
      pages (``opt.save_block_png``), TFRecords (``opt.save_tfrecords``) and
      an annotation evaluation CSV (``opt.annotation_evaluation_csv``).
    - ``opt.min_annotation_percentage`` enables the annotation evaluation and
      is used to filter the pages written to the TFRecords.

    Other options read: ``image_width``, ``image_height``, ``pages``,
    ``xml_mapping_path``, ``xml_path``, ``base_data_path``, ``limit``,
    ``png_dpi``, ``output_path`` and ``color_map``.

    Nothing is returned; the function only adds steps to ``p``.

    Raises:
        RuntimeError: if none of ``lxml_path``, ``pdf_path`` or
            ``pdf_xml_file_list`` is set.
    """
    # a target image size is only used when both dimensions are configured
    image_size = ((opt.image_width, opt.image_height)
                  if opt.image_width and opt.image_height else None)
    page_range = opt.pages
    # base for the page numbers written to the TFRecords (1 without a range)
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
    if opt.lxml_path:
        # LXML input: find the lxml/xml file pairs and read their content
        lxml_xml_file_pairs = (
            p | beam.Create(
                [[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.lxml_path, opt.xml_path]
                ]]) | "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(
                            patterns), opt.limit)),
                    log_prefix='file pairs: ',
                    log_level='debug') | PreventFusion()
            | "ReadFileContent" >> beam.Map(
                lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'lxml_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }))
    elif opt.pdf_path or opt.pdf_xml_file_list:
        # PDF input: the pdf/xml pairs come either from a CSV file list or
        # from matching the pdf_path / xml_path patterns
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p | "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list,
                                                      limit=opt.limit)
                | "TranslateFilePairUrls" >>
                beam.Map(lambda row: (row['source_url'], row['xml_url'])))
        else:
            pdf_xml_url_pairs = (p | beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.pdf_path, opt.xml_path]
            ]]) | "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(lambda patterns: islice(
                    find_file_pairs_grouped_by_parent_directory_or_name(
                        patterns), opt.limit)),
                log_prefix='file pairs: ',
                log_level='debug'))
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs | PreventFusion()
            | "ReadFileContent" >> TransformAndCount(
                beam.Map(
                    lambda filenames: {
                        'source_filename': filenames[0],
                        'xml_filename': filenames[1],
                        'pdf_content': read_all_from_path(filenames[0]),
                        'xml_content': read_all_from_path(filenames[1])
                    }), MetricCounters.FILE_PAIR))

        # add 'lxml_content' to every item; conversion failures are logged
        # and counted via MapOrLog
        lxml_xml_file_pairs = (
            pdf_xml_file_pairs | "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'lxml_content':
                            convert_pdf_bytes_to_lxml(v['pdf_content'],
                                                      path=v['source_filename'
                                                             ],
                                                      page_range=page_range)
                        }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}),
                log_fn=lambda e, v: (get_logger().warning(
                    'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                    e,
                    v['source_filename'],
                    v['xml_filename'],
                    exc_info=e)),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))
    else:
        # NOTE(review): the message does not mention pdf_xml_file_list,
        # although the branch above also accepts it
        raise RuntimeError('either lxml-path or pdf-path required')

    if opt.save_png or opt.save_tfrecords:
        # NOTE(review): pdf_xml_file_pairs is only assigned in the pdf branch
        # above, and the lxml branch items carry no 'pdf_content' - so this
        # step looks like it requires pdf input; confirm that the lxml_path
        # combination is rejected before getting here.
        with_pdf_png_pages = (
            (lxml_xml_file_pairs if opt.save_tfrecords else pdf_xml_file_pairs)
            | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'pdf_png_pages':
                            list(
                                pdf_bytes_to_png_pages(v['pdf_content'],
                                                       dpi=opt.png_dpi,
                                                       image_size=image_size,
                                                       page_range=page_range))
                        }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR))

        if opt.save_png:
            # output name: source path relative to base_data_path, placed
            # below output_path with the extension '.png.zip'
            _ = (with_pdf_png_pages | "SavePdfToPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.png.zip')),
                    '.png', v['pdf_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.save_lxml:
        _ = (lxml_xml_file_pairs
             | "SaveLxml" >> TransformAndLog(
                 beam.Map(lambda v: save_file_content(
                     FileSystems.join(
                         opt.output_path,
                         change_ext(
                             relative_path(opt.base_data_path, v[
                                 'source_filename']), None, '.lxml.gz')), v[
                                     'lxml_content'])),
                 log_fn=lambda x: get_logger().info('saved lxml: %s', x)))

    # add 'svg_pages' (annotated svg roots) to every item; when writing
    # TFRecords the items that already carry 'pdf_png_pages' are used
    annotation_results = ((
        with_pdf_png_pages if opt.save_tfrecords else lxml_xml_file_pairs
    ) | "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
        MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'svg_pages':
                        list(
                            convert_and_annotate_lxml_content(
                                v['lxml_content'],
                                v['xml_content'],
                                xml_mapping,
                                name=v['source_filename']))
                    }),
                # Won't need the XML anymore
                {'lxml_content', 'xml_content'}),
            log_fn=lambda e, v: (get_logger().warning(
                'caught exception (ignoring item): %s, source: %s, xml: %s',
                e,
                v['source_filename'],
                v['xml_filename'],
                exc_info=e)),
            error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR),
        MetricCounters.PAGE,
        lambda v: len(v['svg_pages'])))

    if opt.save_svg:
        _ = (annotation_results | "SaveSvgPages" >> TransformAndLog(
            beam.Map(lambda v: save_svg_roots(
                FileSystems.join(
                    opt.output_path,
                    change_ext(
                        relative_path(opt.base_data_path, v['source_filename']
                                      ), None, '.svg.zip')), v['svg_pages'])),
            log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        # add 'annotation_evaluation'; 'svg_pages' is only kept when
        # min_annotation_percentage is set (the block png step below reads
        # svg_pages from these items in that case)
        annotation_evaluation_results = (
            annotation_results | "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'annotation_evaluation':
                            evaluate_document_by_page(
                                SvgStructuredDocument(v['svg_pages']))
                        }), None
                    if opt.min_annotation_percentage else {'svg_pages'})),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s', x[
                        'source_filename'], x['annotation_evaluation'])))

    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        # replace 'svg_pages' by 'block_png_pages' (one png per svg page)
        with_block_png_pages = (
            (annotation_evaluation_results
             if opt.min_annotation_percentage else annotation_results)
            | "GenerateBlockPng" >> beam.Map(lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'block_png_pages': [
                            svg_page_to_blockified_png_bytes(
                                svg_page, color_map, image_size=image_size)
                            for svg_page in v['svg_pages']
                        ]
                    }), {'svg_pages'})))

        if opt.save_block_png:
            _ = (with_block_png_pages | "SaveBlockPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.block-png.zip')),
                    '.png', v['block_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

        if opt.save_tfrecords:
            if opt.min_annotation_percentage:
                # restrict both page lists to the page indices selected via
                # the annotation evaluation and min_annotation_percentage
                filtered_pages = (
                    with_block_png_pages | "FilterPages" >> TransformAndCount(
                        beam.Map(lambda v: filter_list_props_by_indices(
                            v,
                            get_page_indices_with_min_annotation_percentage(
                                v['annotation_evaluation'], opt.
                                min_annotation_percentage),
                            {'pdf_png_pages', 'block_png_pages'})),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])))
            else:
                filtered_pages = with_block_png_pages
            # one record per page, pairing the pdf png with the block png;
            # NOTE(review): page numbers are first_page + list index, which
            # presumably no longer matches the original page number once
            # pages were filtered out above - confirm.
            _ = (filtered_pages | "WriteTFRecords" >> WritePropsToTFRecord(
                FileSystems.join(opt.output_path, 'data'), lambda v: ({
                    'input_uri':
                    v['source_filename'] + '#page%d' % (first_page + i),
                    'input_image':
                    pdf_png_page,
                    'annotation_uri':
                    (v['source_filename'] + '.annot' + '#page%d' %
                     (first_page + i)),
                    'annotation_image':
                    block_png_page,
                    'page_no':
                    first_page + i
                } for i, pdf_png_page, block_png_page in zip(
                    range(len(v['pdf_png_pages'])), v['pdf_png_pages'], v[
                        'block_png_pages']))))

    if opt.annotation_evaluation_csv:
        # the csv path is split so that the extension can be passed as the
        # file name suffix of the output
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv))
        _ = (  # flake8: noqa
            annotation_evaluation_results | "FlattenAnotationEvaluationResults"
            >> beam.FlatMap(lambda v: to_annotation_evaluation_csv_dict_rows(
                v['annotation_evaluation'],
                document=basename(v['source_filename'])))
            | "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(opt.output_path,
                                      annotation_evaluation_csv_name),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS))
Beispiel #18
0
 def test_should_set_tag_with_scope(self):
     """A scoped tag should not leak into the default scope."""
     token = SVG_TEXT('test')
     root = E.svg(SVG_TEXT_LINE(token))
     doc = SvgStructuredDocument(root)
     doc.set_tag(token, TAG_1, scope=SCOPE_1)
     scoped_tag = doc.get_tag(token, scope=SCOPE_1)
     assert scoped_tag == TAG_1
     default_scope_tag = doc.get_tag(token)
     assert default_scope_tag is None
Beispiel #19
0
def load_svg_pages_structured_document(filename, page_range=None):
    """Load the svg pages from ``filename`` as a ``SvgStructuredDocument``.

    Every page provided by ``load_pages`` (limited to ``page_range`` if given)
    is parsed, and the resulting root elements become the document's pages.
    """
    page_roots = []
    for svg_f in load_pages(filename, page_range=page_range):
        page_tree = etree.parse(svg_f)
        page_roots.append(page_tree.getroot())
    return SvgStructuredDocument(page_roots)
Beispiel #20
0
 def test_should_set_tag_with_level(self):
     """A tag set on level 2 should not be visible on the default level."""
     token = SVG_TEXT('test')
     root = E.svg(SVG_TEXT_LINE(token))
     doc = SvgStructuredDocument(root)
     doc.set_tag(token, TAG_1, level=2)
     level_2_tag = doc.get_tag(token, level=2)
     assert level_2_tag == TAG_1
     default_level_tag = doc.get_tag(token)
     assert default_level_tag is None
Beispiel #21
0
 def test_should_return_root_as_pages(self):
     """A single svg root should be exposed as the only page."""
     svg_root = E.svg()
     doc = SvgStructuredDocument(svg_root)
     pages = list(doc.get_pages())
     assert pages == [svg_root]