def test_add_title_with_tags():
    """Check that the visualised SVG has a title under the text node equal to the tag."""
    root = etree.Element(SVG_DOC, nsmap=SVG_NSMAP)
    tagged_text = _create_tagged_text(TAG1)
    root.append(tagged_text)

    annotated_root = visualize_svg_annotations(root)

    # The title is looked up as a direct child of the SVG text element.
    title_path = SVG_TEXT + '/title'
    title_node = annotated_root.find(title_path)
    assert title_node is not None
    assert title_node.text == TAG1
def test_add_style_block_for_multiple_tags_on_same_node():
    """Check the generated style block when one node carries two space-separated tags."""
    tags = [TAG1, TAG2]
    root = etree.Element(SVG_DOC, nsmap=SVG_NSMAP)
    # Both tags are placed on the same node, joined by a single space.
    root.append(_create_tagged_text(' '.join(tags)))

    annotated_root = visualize_svg_annotations(root)

    style_node = annotated_root.find('style')
    assert style_node is not None
    assert style_node.text == style_block_for_tags([TAG1, TAG2])
def convert(args):
    """Convert an LXML document into one SVG file per page, optionally annotated.

    The SVG file name for each page is produced by formatting the filename
    pattern with the 1-based page number. When ``args.annotate`` is set the
    pages are annotated, visualised before writing, and evaluated per page.

    Args:
        args: options object. The attributes read here are ``svg_path``,
            ``lxml_path``, ``annotate``, ``debug_match``, ``xml_path``,
            ``xml_mapping_path`` and ``annotation_evaluation_csv``.
    """
    logger = get_logger()
    svg_filename_pattern = args.svg_path
    if not svg_filename_pattern:
        # No explicit output pattern: derive one from the input path.
        svg_filename_pattern = svg_pattern_for_lxml_path(args.lxml_path)
    logger.debug('svg_filename_pattern: %s', svg_filename_pattern)
    lxml_root = etree.parse(args.lxml_path).getroot()

    match_detail_reporter = None
    # Everything after the reporter may be created runs inside try/finally so
    # the reporter is closed even if annotation or writing fails.
    try:
        if args.annotate:
            annotators = DEFAULT_ANNOTATORS
            if args.debug_match:
                match_detail_reporter = CsvMatchDetailReporter(
                    open_csv_output(args.debug_match),
                    args.debug_match
                )
            if args.xml_path:
                xml_mapping = parse_xml_mapping(args.xml_mapping_path)
                target_annotations = xml_root_to_target_annotations(
                    etree.parse(args.xml_path).getroot(),
                    xml_mapping
                )
                # Build a new list rather than mutating DEFAULT_ANNOTATORS.
                annotators = annotators + [
                    MatchingAnnotator(
                        target_annotations,
                        match_detail_reporter=match_detail_reporter,
                        use_tag_begin_prefix=True
                    )
                ]
            annotator = Annotator(annotators)
        else:
            annotator = None

        if annotator:
            # Pages are materialised because they are used three times:
            # annotated as a whole document, written, then evaluated.
            svg_roots = list(iter_svg_pages_for_lxml(lxml_root))
            annotator.annotate(SvgStructuredDocument(svg_roots))
        else:
            # Without annotation the pages are only iterated once for writing.
            svg_roots = iter_svg_pages_for_lxml(lxml_root)

        for page_index, svg_root in enumerate(svg_roots):
            if annotator:
                svg_root = visualize_svg_annotations(svg_root)
            svg_filename = svg_filename_pattern.format(1 + page_index)
            logger.info('writing to: %s', svg_filename)
            with open(svg_filename, 'wb') as f:
                etree.ElementTree(svg_root).write(f, pretty_print=True)

        if annotator:
            tagging_evaluation_results = evaluate_document_by_page(
                SvgStructuredDocument(svg_roots)
            )
            logger.info(
                'tagging evaluation:\n%s',
                '\n'.join([
                    'page{}: {}'.format(1 + i, r)
                    for i, r in enumerate(tagging_evaluation_results)
                ])
            )
            if args.annotation_evaluation_csv:
                write_dict_csv(
                    args.annotation_evaluation_csv,
                    DEFAULT_EVALUATION_COLUMNS,
                    to_annotation_evaluation_csv_dict_rows(
                        tagging_evaluation_results,
                        document=os.path.basename(args.lxml_path)
                    )
                )
    finally:
        if match_detail_reporter:
            match_detail_reporter.close()
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert LXML content to annotated, visualised SVG page roots.

    Each processing stage is timed with a ``StopWatchRecorder`` and a summary
    is logged at the end.

    Args:
        lxml_content: serialized LXML document, parsed with ``etree.fromstring``.
        xml_content: serialized target XML, parsed in recovery mode.
        xml_mapping: mapping passed on to ``xml_root_to_target_annotations``.
        name: optional label used only in the log message.

    Returns:
        A list with the result of ``visualize_svg_annotations`` for each page.
    """
    timings = StopWatchRecorder()

    timings.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # Parse the XML leniently, as XML errors are not uncommon.
    timings.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    timings.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
    timings.stop()

    matching_annotator = MatchingAnnotator(
        target_annotations,
        use_tag_begin_prefix=True
    )
    annotator = Annotator(DEFAULT_ANNOTATORS + [matching_annotator])

    timings.start('convert to svg')
    page_roots = list(iter_svg_pages_for_lxml(lxml_root))

    timings.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(page_roots))

    timings.start('add visualisation')
    visualized_roots = list(map(visualize_svg_annotations, page_roots))
    timings.stop()

    # Sizes are logged with thousands separators.
    lxml_size = format(len(lxml_content), ',')
    xml_size = format(len(xml_content), ',')
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, lxml_size, xml_size, timings, align_native_enabled
    )
    return visualized_roots