def test_should_use_viewbox_if_available(self):
    """A page carrying the SVG_VIEWBOX_ATTRIB attribute reports it as its bounding box."""
    expected = BoundingBox(11, 12, 101, 102)
    svg_root = E.svg({SVG_VIEWBOX_ATTRIB: format_bounding_box(expected)})
    structured_document = SvgStructuredDocument(svg_root)
    actual = structured_document.get_bounding_box(svg_root)
    assert actual == expected
def test_should_not_return_bounding_box_if_font_size_is_missing(self):
    """A text node with x/y but no font-size attribute yields no bounding box."""
    position_only_attributes = {'x': '10', 'y': '11'}
    node = SVG_TEXT(position_only_attributes)
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))
    actual = structured_document.get_bounding_box(node)
    assert actual is None
def test_should_be_able_to_set_bounding_box(self):
    """Setting a bounding box stores its formatted form under SVGE_BOUNDING_BOX."""
    new_box = BoundingBox(11, 12, 101, 102)
    node = SVG_TEXT('a', {'x': '10', 'y': '11', 'font-size': '100'})
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))

    structured_document.set_bounding_box(node, new_box)

    stored_value = node.attrib[SVGE_BOUNDING_BOX]
    assert stored_value == format_bounding_box(new_box)
def test_should_calculate_default_bounding_box(self):
    """Without a stored bounding box, one is derived from x, y and font-size."""
    node = SVG_TEXT('a', {'x': '10', 'y': '11', 'font-size': '100'})
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))
    # Expected width for the single character is 0.8 times the font size.
    expected = BoundingBox(10, 11, 100 * 0.8, 100)
    assert structured_document.get_bounding_box(node) == expected
def test_should_tag_text_as_line_no(self):
    """Tagging a text node as LINE_NO sets that value as its 'class' attribute."""
    node = SVG_TEXT()
    svg_root = E.svg(SVG_TEXT_LINE(node))
    structured_document = SvgStructuredDocument(svg_root)

    structured_document.set_tag(node, SvgStyleClasses.LINE_NO)

    assert node.attrib['class'] == SvgStyleClasses.LINE_NO
def test_should_use_bounding_box_if_available(self):
    """A bounding box stored under SVGE_BOUNDING_BOX wins over the x/y/font-size estimate."""
    expected = BoundingBox(11, 12, 101, 102)
    attributes = {
        'x': '10',
        'y': '11',
        'font-size': '100',
        SVGE_BOUNDING_BOX: format_bounding_box(expected)
    }
    node = SVG_TEXT('a', attributes)
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))
    assert structured_document.get_bounding_box(node) == expected
def convert(args):
    """Convert an LXML file into one SVG file per page, optionally annotating it.

    Reads these attributes from ``args``:

    - ``lxml_path``: path of the LXML document to parse.
    - ``svg_path``: output filename pattern, formatted with the 1-based page
      number; derived from ``lxml_path`` when empty.
    - ``annotate``: when truthy, the pages are annotated, visualised and
      evaluated.
    - ``debug_match``: optional CSV path for match details (only used when
      annotating).
    - ``xml_path`` / ``xml_mapping_path``: optional target XML and its
      mapping, used to add a ``MatchingAnnotator``.
    - ``annotation_evaluation_csv``: optional CSV path for the per-page
      evaluation results.

    The match detail reporter, if one was created, is closed even when the
    conversion fails part-way through.
    """
    logger = get_logger()
    svg_filename_pattern = args.svg_path
    if not svg_filename_pattern:
        svg_filename_pattern = svg_pattern_for_lxml_path(args.lxml_path)
    logger.debug('svg_filename_pattern: %s', svg_filename_pattern)
    lxml_root = etree.parse(args.lxml_path).getroot()

    match_detail_reporter = None
    try:
        if args.annotate:
            annotators = DEFAULT_ANNOTATORS
            if args.debug_match:
                match_detail_reporter = CsvMatchDetailReporter(
                    open_csv_output(args.debug_match),
                    args.debug_match
                )
            if args.xml_path:
                xml_mapping = parse_xml_mapping(args.xml_mapping_path)
                target_annotations = xml_root_to_target_annotations(
                    etree.parse(args.xml_path).getroot(),
                    xml_mapping
                )
                # Build a new list so that DEFAULT_ANNOTATORS is not mutated.
                annotators = annotators + [
                    MatchingAnnotator(
                        target_annotations,
                        match_detail_reporter=match_detail_reporter,
                        use_tag_begin_prefix=True
                    )
                ]
            annotator = Annotator(annotators)
        else:
            annotator = None

        if annotator:
            # Annotation needs all pages at once, so materialise them.
            svg_roots = list(iter_svg_pages_for_lxml(lxml_root))
            annotator.annotate(SvgStructuredDocument(svg_roots))
        else:
            # Without annotation the pages can be streamed one at a time.
            svg_roots = iter_svg_pages_for_lxml(lxml_root)

        for page_index, svg_root in enumerate(svg_roots):
            if annotator:
                svg_root = visualize_svg_annotations(svg_root)
            svg_filename = svg_filename_pattern.format(1 + page_index)
            logger.info('writing to: %s', svg_filename)
            with open(svg_filename, 'wb') as f:
                etree.ElementTree(svg_root).write(f, pretty_print=True)

        if annotator:
            tagging_evaluation_results = evaluate_document_by_page(
                SvgStructuredDocument(svg_roots)
            )
            logger.info(
                'tagging evaluation:\n%s',
                '\n'.join([
                    'page{}: {}'.format(1 + i, r)
                    for i, r in enumerate(tagging_evaluation_results)
                ])
            )
            if args.annotation_evaluation_csv:
                write_dict_csv(
                    args.annotation_evaluation_csv,
                    DEFAULT_EVALUATION_COLUMNS,
                    to_annotation_evaluation_csv_dict_rows(
                        tagging_evaluation_results,
                        document=os.path.basename(args.lxml_path)
                    )
                )
    finally:
        # Always release the CSV output, including on failure.
        if match_detail_reporter:
            match_detail_reporter.close()
def test_should_estimate_width_based_on_number_of_characters(self):
    """The estimated width scales with the number of characters in the text."""
    content = 'abc'
    node = SVG_TEXT(content, {'x': '10', 'y': '11', 'font-size': '100'})
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))
    expected_width = 100 * 0.8 * len(content)
    expected = BoundingBox(10, 11, expected_width, 100)
    assert structured_document.get_bounding_box(node) == expected
def test_should_return_all_tag_by_scope(self):
    """Tags set with and without a scope are all reported, keyed by scope."""
    node = SVG_TEXT('test')
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))

    structured_document.set_tag(node, TAG_1)
    structured_document.set_tag(node, TAG_2, scope=SCOPE_1)

    assert structured_document.get_tag(node) == TAG_1
    assert structured_document.get_tag(node, scope=SCOPE_1) == TAG_2
    # The default (unscoped) tag is keyed by None.
    expected_by_scope = {None: TAG_1, SCOPE_1: TAG_2}
    assert structured_document.get_tag_by_scope(node) == expected_by_scope
def test_should_find_lines_of_page_without_blocks(self):
    """Lines placed directly under the svg root are found as lines of the page."""
    first_line = SVG_TEXT_LINE()
    second_line = SVG_TEXT_LINE()
    structured_document = SvgStructuredDocument(E.svg(first_line, second_line))
    first_page = structured_document.get_pages()[0]
    found_lines = list(structured_document.get_lines_of_page(first_page))
    assert found_lines == [first_line, second_line]
def test_should_find_tokens_of_line(self):
    """Only the tokens of the requested line are returned, not those of other lines."""
    first_token = SVG_TEXT()
    second_token = SVG_TEXT()
    target_line = SVG_TEXT_LINE(first_token, second_token)
    other_line = SVG_TEXT_LINE(SVG_TEXT())
    structured_document = SvgStructuredDocument(E.svg(target_line, other_line))
    found_tokens = list(structured_document.get_tokens_of_line(target_line))
    assert found_tokens == [first_token, second_token]
def test_should_not_fail_setting_empty_tag_to_none(self):
    """Clearing a tag that was never set is a no-op, with and without a scope."""
    node = SVG_TEXT('test')
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))

    structured_document.set_tag(node, None)
    structured_document.set_tag(node, None, scope=SCOPE_1)

    unscoped_tag = structured_document.get_tag(node)
    scoped_tag = structured_document.get_tag(node, scope=SCOPE_1)
    assert unscoped_tag is None
    assert scoped_tag is None
def test_should_call_save_svg_structured_document(self):
    """save_structured_document delegates SVG documents to save_svg_structured_document."""
    # Build an actual <svg> element (E.svg is only the element factory), as
    # the other tests do.
    structured_document = SvgStructuredDocument(E.svg())
    m = structured_document_saver
    with patch.object(
            m, 'save_svg_structured_document'
    ) as save_svg_structured_document_mock:
        save_structured_document(FILE_1, structured_document)
        save_svg_structured_document_mock.assert_called_with(
            FILE_1, structured_document
        )
def test_should_call_save_pages(self):
    """Saving an SVG document passes the serialised pages to save_pages as '.svg'."""
    saver_module = structured_document_saver
    svg_root = E.svg()
    structured_document = SvgStructuredDocument(svg_root)
    with patch.object(saver_module, 'save_pages') as save_pages_mock, \
            patch.object(saver_module, 'etree') as etree_mock:
        save_svg_structured_document(FILE_1, structured_document)
        save_pages_mock.assert_called_with(FILE_1, '.svg', ANY)
        positional_args, _ = save_pages_mock.call_args
        # Consume the pages while etree is still patched; they may be lazy.
        saved_pages = list(positional_args[2])
        assert saved_pages == [etree_mock.tostring(svg_root)]
def svg_page_to_blockified_png_bytes(svg_page, color_map, image_size=None):
    """Render the annotation blocks of a single SVG page as PNG bytes.

    Args:
        svg_page: root element of the SVG page; must carry a ``viewBox``
            attribute, whose last two values are used as width and height.
        color_map: passed through to ``annotated_blocks_to_image``.
        image_size: optional target size, passed on as ``scale_to_size``.

    Returns:
        The PNG-encoded image as bytes.

    Raises:
        RuntimeError: if the page has no (or an empty) ``viewBox`` attribute.
    """
    structured_document = SvgStructuredDocument(svg_page)
    blocks = expand_blocks(
        merge_blocks(
            annotation_document_page_to_annotation_blocks(
                structured_document,
                structured_document.get_pages()[0]
            )
        )
    )
    viewbox = svg_page.attrib.get('viewBox')
    if not viewbox:
        raise RuntimeError(
            'viewbox missing on svg, available attributes: %s' %
            svg_page.attrib.keys()
        )
    # SVG allows the four viewBox numbers to be separated by whitespace
    # and/or commas, so normalise commas before splitting.
    _, _, width, height = viewbox.replace(',', ' ').split()
    image = annotated_blocks_to_image(
        blocks, color_map,
        width=float(width), height=float(height), background='white',
        scale_to_size=image_size
    )
    out = BytesIO()
    image.save(out, 'png')
    return out.getvalue()
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert LXML content to SVG pages, annotated against the target XML.

    Args:
        lxml_content: serialised LXML document.
        xml_content: serialised target XML (parsed leniently).
        xml_mapping: mapping used to extract the target annotations.
        name: optional label, only used in the log message.

    Returns:
        A list of SVG page roots, after annotation and visualisation.
    """
    timer = StopWatchRecorder()

    timer.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # use a more lenient way to parse xml as xml errors are not uncommon
    timer.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    timer.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
    timer.stop()

    matching_annotator = MatchingAnnotator(
        target_annotations, use_tag_begin_prefix=True
    )
    annotator = Annotator(DEFAULT_ANNOTATORS + [matching_annotator])

    timer.start('convert to svg')
    svg_roots = list(iter_svg_pages_for_lxml(lxml_root))

    timer.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(svg_roots))

    timer.start('add visualisation')
    visualized_svg_roots = []
    for svg_root in svg_roots:
        visualized_svg_roots.append(visualize_svg_annotations(svg_root))
    timer.stop()

    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name,
        '{:,}'.format(len(lxml_content)),
        '{:,}'.format(len(xml_content)),
        timer,
        align_native_enabled
    )

    return visualized_svg_roots
def configure_pipeline(p, opt):
    """Attach the preprocessing steps selected by ``opt`` to the Beam pipeline ``p``.

    The source is either LXML/XML file pairs (``opt.lxml_path``) or PDF/XML
    file pairs (``opt.pdf_path`` or ``opt.pdf_xml_file_list``); PDFs are
    converted to LXML first. Every item is then converted to annotated SVG
    pages. Depending on the ``opt.save_*`` flags, PNG pages, LXML, SVG pages,
    block PNGs and TFRecords are written below ``opt.output_path``, and the
    annotation evaluation is optionally written as CSV.

    Raises:
        RuntimeError: if neither an LXML nor a PDF source is configured.
    """
    if opt.image_width and opt.image_height:
        image_size = (opt.image_width, opt.image_height)
    else:
        image_size = None
    page_range = opt.pages
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)

    if opt.lxml_path:
        lxml_xml_file_pairs = (
            p
            | beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.lxml_path, opt.xml_path]
            ]])
            | "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(
                    lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(
                            patterns
                        ),
                        opt.limit
                    )
                ),
                log_prefix='file pairs: ',
                log_level='debug'
            )
            | PreventFusion()
            | "ReadFileContent" >> beam.Map(
                lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'lxml_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }
            )
        )
    elif opt.pdf_path or opt.pdf_xml_file_list:
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p
                | "ReadFilePairUrls" >> ReadDictCsv(
                    opt.pdf_xml_file_list, limit=opt.limit
                )
                | "TranslateFilePairUrls" >> beam.Map(
                    lambda row: (row['source_url'], row['xml_url'])
                )
            )
        else:
            pdf_xml_url_pairs = (
                p
                | beam.Create([[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.pdf_path, opt.xml_path]
                ]])
                | "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(
                        lambda patterns: islice(
                            find_file_pairs_grouped_by_parent_directory_or_name(
                                patterns
                            ),
                            opt.limit
                        )
                    ),
                    log_prefix='file pairs: ',
                    log_level='debug'
                )
            )
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs
            | PreventFusion()
            | "ReadFileContent" >> TransformAndCount(
                beam.Map(
                    lambda filenames: {
                        'source_filename': filenames[0],
                        'xml_filename': filenames[1],
                        'pdf_content': read_all_from_path(filenames[0]),
                        'xml_content': read_all_from_path(filenames[1])
                    }
                ),
                MetricCounters.FILE_PAIR
            )
        )
        lxml_xml_file_pairs = (
            pdf_xml_file_pairs
            | "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'lxml_content': convert_pdf_bytes_to_lxml(
                            v['pdf_content'],
                            path=v['source_filename'],
                            page_range=page_range
                        )
                    }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}
                ),
                log_fn=lambda e, v: (
                    get_logger().warning(
                        'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                        e, v['source_filename'], v['xml_filename'], exc_info=e
                    )
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR
            )
        )
    else:
        raise RuntimeError('either lxml-path or pdf-path required')

    if opt.save_png or opt.save_tfrecords:
        # NOTE(review): this step reads 'pdf_content', and the non-tfrecords
        # source only exists in the PDF branch above - it looks like PNG /
        # TFRecord output is not usable with lxml_path input; confirm.
        if opt.save_tfrecords:
            png_source = lxml_xml_file_pairs
        else:
            png_source = pdf_xml_file_pairs
        with_pdf_png_pages = (
            png_source
            | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'pdf_png_pages': list(
                            pdf_bytes_to_png_pages(
                                v['pdf_content'],
                                dpi=opt.png_dpi,
                                image_size=image_size,
                                page_range=page_range
                            )
                        )
                    }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR
            )
        )

        if opt.save_png:
            _ = (
                with_pdf_png_pages
                | "SavePdfToPng" >> TransformAndLog(
                    beam.Map(
                        lambda v: save_pages(
                            FileSystems.join(
                                opt.output_path,
                                change_ext(
                                    relative_path(
                                        opt.base_data_path,
                                        v['source_filename']
                                    ),
                                    None, '.png.zip'
                                )
                            ),
                            '.png',
                            v['pdf_png_pages']
                        )
                    ),
                    log_fn=lambda x: get_logger().info('saved result: %s', x)
                )
            )

    if opt.save_lxml:
        _ = (
            lxml_xml_file_pairs
            | "SaveLxml" >> TransformAndLog(
                beam.Map(
                    lambda v: save_file_content(
                        FileSystems.join(
                            opt.output_path,
                            change_ext(
                                relative_path(
                                    opt.base_data_path,
                                    v['source_filename']
                                ),
                                None, '.lxml.gz'
                            )
                        ),
                        v['lxml_content']
                    )
                ),
                log_fn=lambda x: get_logger().info('saved lxml: %s', x)
            )
        )

    # When writing TFRecords the PNG pages have to travel along with each item.
    if opt.save_tfrecords:
        annotation_source = with_pdf_png_pages
    else:
        annotation_source = lxml_xml_file_pairs
    annotation_results = (
        annotation_source
        | "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
            MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'svg_pages': list(
                            convert_and_annotate_lxml_content(
                                v['lxml_content'],
                                v['xml_content'],
                                xml_mapping,
                                name=v['source_filename']
                            )
                        )
                    }),
                    # Won't need the XML anymore
                    {'lxml_content', 'xml_content'}
                ),
                log_fn=lambda e, v: (
                    get_logger().warning(
                        'caught exception (ignoring item): %s, source: %s, xml: %s',
                        e, v['source_filename'], v['xml_filename'], exc_info=e
                    )
                ),
                error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR
            ),
            MetricCounters.PAGE,
            lambda v: len(v['svg_pages'])
        )
    )

    if opt.save_svg:
        _ = (
            annotation_results
            | "SaveSvgPages" >> TransformAndLog(
                beam.Map(
                    lambda v: save_svg_roots(
                        FileSystems.join(
                            opt.output_path,
                            change_ext(
                                relative_path(
                                    opt.base_data_path,
                                    v['source_filename']
                                ),
                                None, '.svg.zip'
                            )
                        ),
                        v['svg_pages']
                    )
                ),
                log_fn=lambda x: get_logger().info('saved result: %s', x)
            )
        )

    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        annotation_evaluation_results = (
            annotation_results
            | "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(
                    lambda v: remove_keys_from_dict(
                        extend_dict(v, {
                            'annotation_evaluation': evaluate_document_by_page(
                                SvgStructuredDocument(v['svg_pages'])
                            )
                        }),
                        # the svg pages are still needed for the block png step
                        # when filtering by annotation percentage
                        None if opt.min_annotation_percentage else {'svg_pages'}
                    )
                ),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s',
                    x['source_filename'], x['annotation_evaluation']
                )
            )
        )

    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        if opt.min_annotation_percentage:
            block_png_source = annotation_evaluation_results
        else:
            block_png_source = annotation_results
        with_block_png_pages = (
            block_png_source
            | "GenerateBlockPng" >> beam.Map(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'block_png_pages': [
                            svg_page_to_blockified_png_bytes(
                                svg_page, color_map, image_size=image_size
                            )
                            for svg_page in v['svg_pages']
                        ]
                    }),
                    {'svg_pages'}
                )
            )
        )

        if opt.save_block_png:
            _ = (
                with_block_png_pages
                | "SaveBlockPng" >> TransformAndLog(
                    beam.Map(
                        lambda v: save_pages(
                            FileSystems.join(
                                opt.output_path,
                                change_ext(
                                    relative_path(
                                        opt.base_data_path,
                                        v['source_filename']
                                    ),
                                    None, '.block-png.zip'
                                )
                            ),
                            '.png',
                            v['block_png_pages']
                        )
                    ),
                    log_fn=lambda x: get_logger().info('saved result: %s', x)
                )
            )

        if opt.save_tfrecords:
            if opt.min_annotation_percentage:
                filtered_pages = (
                    with_block_png_pages
                    | "FilterPages" >> TransformAndCount(
                        beam.Map(
                            lambda v: filter_list_props_by_indices(
                                v,
                                get_page_indices_with_min_annotation_percentage(
                                    v['annotation_evaluation'],
                                    opt.min_annotation_percentage
                                ),
                                {'pdf_png_pages', 'block_png_pages'}
                            )
                        ),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])
                    )
                )
            else:
                filtered_pages = with_block_png_pages
            _ = (
                filtered_pages
                | "WriteTFRecords" >> WritePropsToTFRecord(
                    FileSystems.join(opt.output_path, 'data'),
                    lambda v: (
                        {
                            'input_uri': (
                                v['source_filename'] +
                                '#page%d' % (first_page + i)
                            ),
                            'input_image': pdf_png_page,
                            'annotation_uri': (
                                v['source_filename'] + '.annot' +
                                '#page%d' % (first_page + i)
                            ),
                            'annotation_image': block_png_page,
                            'page_no': first_page + i
                        }
                        for i, (pdf_png_page, block_png_page) in enumerate(
                            zip(v['pdf_png_pages'], v['block_png_pages'])
                        )
                    )
                )
            )

    if opt.annotation_evaluation_csv:
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv)
        )
        _ = (  # flake8: noqa
            annotation_evaluation_results
            | "FlattenAnotationEvaluationResults" >> beam.FlatMap(
                lambda v: to_annotation_evaluation_csv_dict_rows(
                    v['annotation_evaluation'],
                    document=basename(v['source_filename'])
                )
            )
            | "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(
                    opt.output_path, annotation_evaluation_csv_name
                ),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS
            )
        )
def test_should_set_tag_with_scope(self):
    """A tag set within a scope is visible in that scope only."""
    node = SVG_TEXT('test')
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))

    structured_document.set_tag(node, TAG_1, scope=SCOPE_1)

    scoped_tag = structured_document.get_tag(node, scope=SCOPE_1)
    assert scoped_tag == TAG_1
    unscoped_tag = structured_document.get_tag(node)
    assert unscoped_tag is None
def load_svg_pages_structured_document(filename, page_range=None):
    """Load the SVG pages of ``filename`` into an SvgStructuredDocument.

    Args:
        filename: passed to ``load_pages`` to obtain the individual pages.
        page_range: optional page range, passed through to ``load_pages``.

    Returns:
        An SvgStructuredDocument built from the parsed root of every page.
    """
    page_roots = []
    for svg_f in load_pages(filename, page_range=page_range):
        page_roots.append(etree.parse(svg_f).getroot())
    return SvgStructuredDocument(page_roots)
def test_should_set_tag_with_level(self):
    """A tag set at level 2 is visible at that level but not at the default level."""
    node = SVG_TEXT('test')
    structured_document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(node)))

    structured_document.set_tag(node, TAG_1, level=2)

    level_tag = structured_document.get_tag(node, level=2)
    assert level_tag == TAG_1
    default_tag = structured_document.get_tag(node)
    assert default_tag is None
def test_should_return_root_as_pages(self):
    """A document built from a single root exposes that root as its only page."""
    svg_root = E.svg()
    structured_document = SvgStructuredDocument(svg_root)
    pages = list(structured_document.get_pages())
    assert pages == [svg_root]