コード例 #1
0
def test_add_title_with_tags():
    """A tagged text element should gain a ``title`` child containing its tag."""
    document = etree.Element(SVG_DOC, nsmap=SVG_NSMAP)
    tagged_text = _create_tagged_text(TAG1)
    document.append(tagged_text)

    visualized = visualize_svg_annotations(document)

    # The title is looked up as a direct child of the text element.
    title_path = SVG_TEXT + '/title'
    title_node = visualized.find(title_path)
    assert title_node is not None
    assert title_node.text == TAG1
コード例 #2
0
def test_add_style_block_for_multiple_tags_on_same_node():
    """A node carrying two space-separated tags should yield a style block for both."""
    tags = [TAG1, TAG2]
    document = etree.Element(SVG_DOC, nsmap=SVG_NSMAP)
    # Both tags are attached to a single text node, separated by a space.
    document.append(_create_tagged_text(' '.join(tags)))

    style_node = visualize_svg_annotations(document).find('style')

    assert style_node is not None
    expected_style = style_block_for_tags(tags)
    assert style_node.text == expected_style
コード例 #3
0
def convert(args):
    """
    Convert an LXML file into one SVG file per page, optionally annotating it.

    When ``args.annotate`` is set, the pages are annotated using
    ``DEFAULT_ANNOTATORS`` (plus a ``MatchingAnnotator`` if ``args.xml_path``
    is given), each written page is passed through
    ``visualize_svg_annotations`` and a per-page tagging evaluation is logged
    (and written to ``args.annotation_evaluation_csv`` if provided).

    Args:
        args: parsed arguments. Attributes read here are ``lxml_path``,
            ``svg_path``, ``annotate``, ``debug_match``, ``xml_path``,
            ``xml_mapping_path`` and ``annotation_evaluation_csv``.
    """
    logger = get_logger()
    svg_filename_pattern = args.svg_path
    if not svg_filename_pattern:
        # No explicit output pattern: derive one from the input path.
        svg_filename_pattern = svg_pattern_for_lxml_path(args.lxml_path)
    logger.debug('svg_filename_pattern: %s', svg_filename_pattern)
    lxml_root = etree.parse(args.lxml_path).getroot()

    match_detail_reporter = None
    # The reporter wraps an opened CSV output; the try/finally guarantees it
    # is closed even if annotation, writing or evaluation raises.
    try:
        annotator = None
        if args.annotate:
            annotators = DEFAULT_ANNOTATORS
            if args.debug_match:
                match_detail_reporter = CsvMatchDetailReporter(
                    open_csv_output(args.debug_match), args.debug_match)
            if args.xml_path:
                xml_mapping = parse_xml_mapping(args.xml_mapping_path)
                target_annotations = xml_root_to_target_annotations(
                    etree.parse(args.xml_path).getroot(), xml_mapping)
                # Build a new list rather than mutating DEFAULT_ANNOTATORS.
                annotators = annotators + [
                    MatchingAnnotator(
                        target_annotations,
                        match_detail_reporter=match_detail_reporter,
                        use_tag_begin_prefix=True)
                ]
            annotator = Annotator(annotators)

        if annotator:
            # Annotation needs all pages at once, so materialise them.
            svg_roots = list(iter_svg_pages_for_lxml(lxml_root))
            annotator.annotate(SvgStructuredDocument(svg_roots))
        else:
            # Without annotation the pages are consumed lazily, one by one.
            svg_roots = iter_svg_pages_for_lxml(lxml_root)

        for page_index, svg_root in enumerate(svg_roots):
            if annotator:
                svg_root = visualize_svg_annotations(svg_root)
            # Output file names are numbered starting at 1.
            svg_filename = svg_filename_pattern.format(1 + page_index)
            logger.info('writing to: %s', svg_filename)
            with open(svg_filename, 'wb') as f:
                etree.ElementTree(svg_root).write(f, pretty_print=True)

        if annotator:
            # svg_roots is a list in this branch, so it can be reused here.
            tagging_evaluation_results = evaluate_document_by_page(
                SvgStructuredDocument(svg_roots))
            logger.info(
                'tagging evaluation:\n%s', '\n'.join([
                    'page{}: {}'.format(1 + i, r)
                    for i, r in enumerate(tagging_evaluation_results)
                ]))
            if args.annotation_evaluation_csv:
                write_dict_csv(
                    args.annotation_evaluation_csv, DEFAULT_EVALUATION_COLUMNS,
                    to_annotation_evaluation_csv_dict_rows(
                        tagging_evaluation_results,
                        document=os.path.basename(args.lxml_path)))
    finally:
        if match_detail_reporter:
            match_detail_reporter.close()
コード例 #4
0
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """
    Convert LXML content to annotated SVG pages with visualisation applied.

    Args:
        lxml_content: the LXML document, parsed with ``etree.fromstring``.
        xml_content: the target XML, parsed leniently (with recovery).
        xml_mapping: mapping passed to ``xml_root_to_target_annotations``.
        name: optional name, only used in the summary log message.

    Returns:
        A list with the result of ``visualize_svg_annotations`` for each page.
    """
    timer = StopWatchRecorder()

    timer.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # Parse the XML leniently, since XML errors are not uncommon.
    timer.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    timer.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
    timer.stop()

    matching_annotator = MatchingAnnotator(
        target_annotations, use_tag_begin_prefix=True
    )
    annotator = Annotator(DEFAULT_ANNOTATORS + [matching_annotator])

    timer.start('convert to svg')
    pages = list(iter_svg_pages_for_lxml(lxml_root))

    timer.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(pages))

    timer.start('add visualisation')
    visualized_pages = []
    for page in pages:
        visualized_pages.append(visualize_svg_annotations(page))
    timer.stop()

    # Sizes are logged with thousands separators.
    lxml_size = format(len(lxml_content), ',')
    xml_size = format(len(xml_content), ',')
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, lxml_size, xml_size, timer, align_native_enabled
    )

    return visualized_pages