コード例 #1
0
 def test_should_return_empty_target_annotations_for_no_matching_annotations(
         self):
     """The mapped xpath ('title') matches no element, so nothing is returned."""
     mapping = {'article': {TAG1: 'title'}}
     root = E.article(E.other(SOME_VALUE))
     result = xml_root_to_target_annotations(root, mapping)
     assert result == []
コード例 #2
0
 def test_should_not_apply_range_children_if_value_is_not_integer(self):
     """Non-numeric min/max values: only the plain children values are kept."""
     fpage, lpage = 'abc', 'xyz'
     entry = E.entry(E.child1(SOME_VALUE), E.fpage(fpage), E.lpage(lpage))
     children_range = [{
         'min': {'xpath': 'fpage'},
         'max': {'xpath': 'lpage'}
     }]
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
         TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps(children_range)
     }
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': tag_mapping})
     name_value_pairs = [(t.name, t.value) for t in result]
     assert name_value_pairs == [(TAG1, [fpage, lpage])]
コード例 #3
0
 def test_should_apply_range_children_as_separate_target_annotations(self):
     """A 'standalone' range yields one annotation per number from min to max."""
     num_values = [101, 102, 103, 104, 105, 106, 107]
     first_page = str(min(num_values))
     last_page = str(max(num_values))
     entry = E.entry(
         E.child1(SOME_VALUE), E.fpage(first_page), E.lpage(last_page))
     children_range = [{
         'min': {'xpath': 'fpage'},
         'max': {'xpath': 'lpage'},
         'standalone': True
     }]
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
         TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps(children_range)
     }
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': tag_mapping})
     expected = [(TAG1, str(x)) for x in num_values]
     assert [(t.name, t.value) for t in result] == expected
コード例 #4
0
 def test_should_not_apply_range_children_if_xpath_not_matching(self):
     """The 'max' xpath matches nothing, so only the fpage value is returned."""
     num_values = [101, 102, 103, 104, 105, 106, 107]
     fpage = str(min(num_values))
     lpage = str(max(num_values))
     entry = E.entry(E.child1(SOME_VALUE), E.fpage(fpage), E.lpage(lpage))
     children_range = [{
         'min': {'xpath': 'fpage'},
         'max': {'xpath': 'unknown'}
     }]
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|unknown',
         TAG1 + XmlMappingSuffix.CHILDREN_RANGE: json.dumps(children_range)
     }
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': tag_mapping})
     name_value_pairs = [(t.name, t.value) for t in result]
     assert name_value_pairs == [(TAG1, fpage)]
コード例 #5
0
 def test_should_not_apply_concat_children_if_one_node_was_not_found(self):
     """One concat xpath matches nothing, so the plain children values remain."""
     num_values = ['101', '202']
     parent = E.parent(
         E.child1(SOME_VALUE), E.fpage(num_values[0]), E.lpage(num_values[1]))
     children_concat = [[
         {'xpath': './/fpage'},
         {'value': '-'},
         {'xpath': './/unknown'}
     ]]
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.CHILDREN: './/*',
         TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: json.dumps(children_concat)
     }
     result = xml_root_to_target_annotations(
         E.article(E.entry(parent)), {'article': tag_mapping})
     expected = [(TAG1, [SOME_VALUE, num_values[0], num_values[1]])]
     assert [(t.name, t.value) for t in result] == expected
コード例 #6
0
 def test_should_return_matching_target_annotations(self):
     """One element matched by the mapped xpath yields one named annotation."""
     mapping = {'article': {TAG1: 'title'}}
     root = E.article(E.title(SOME_VALUE))
     annotations = xml_root_to_target_annotations(root, mapping)
     assert len(annotations) == 1
     only = annotations[0]
     assert only.name == TAG1
     assert only.value == SOME_VALUE
コード例 #7
0
 def test_should_strip_extra_space(self):
     """A double space inside the element text is reduced to a single space."""
     raw_text = SOME_VALUE + '  ' + SOME_VALUE_2
     mapping = {'article': {TAG1: 'abstract'}}
     annotations = xml_root_to_target_annotations(
         E.article(E.abstract(raw_text)), mapping)
     assert len(annotations) == 1
     only = annotations[0]
     assert only.name == TAG1
     assert only.value == SOME_VALUE + ' ' + SOME_VALUE_2
コード例 #8
0
 def test_should_return_full_text(self):
     """Text of nested elements is included in the annotation value."""
     title = E.title('some ', E.other('embedded'), ' text')
     mapping = {'article': {TAG1: 'title'}}
     annotations = xml_root_to_target_annotations(E.article(title), mapping)
     assert len(annotations) == 1
     only = annotations[0]
     assert only.name == TAG1
     assert only.value == 'some embedded text'
コード例 #9
0
def convert(args):
    """Convert an LXML file into one SVG file per page, optionally annotated.

    Attributes read from ``args``:
        lxml_path: path of the LXML input to parse.
        svg_path: output filename pattern, formatted with the 1-based page
            number; when falsy it is derived from ``lxml_path``.
        annotate: when truthy, the pages are annotated, visualized and
            evaluated.
        debug_match: when truthy (and annotating), a CSV match detail
            reporter is created for this output.
        xml_path / xml_mapping_path: when ``xml_path`` is truthy (and
            annotating), target annotations are extracted from that XML
            using the parsed mapping and a ``MatchingAnnotator`` is added.
        annotation_evaluation_csv: when truthy (and annotating), the
            per-page evaluation results are also written to this CSV.
    """
    logger = get_logger()
    svg_filename_pattern = args.svg_path
    if not svg_filename_pattern:
        svg_filename_pattern = svg_pattern_for_lxml_path(args.lxml_path)
    logger.debug('svg_filename_pattern: %s', svg_filename_pattern)
    lxml_root = etree.parse(args.lxml_path).getroot()

    match_detail_reporter = None
    # The reporter wraps an opened CSV output; the try/finally guarantees it
    # is closed even if parsing, annotating or writing raises.
    try:
        if args.annotate:
            annotators = DEFAULT_ANNOTATORS
            if args.debug_match:
                match_detail_reporter = CsvMatchDetailReporter(
                    open_csv_output(args.debug_match), args.debug_match)
            if args.xml_path:
                xml_mapping = parse_xml_mapping(args.xml_mapping_path)
                target_annotations = xml_root_to_target_annotations(
                    etree.parse(args.xml_path).getroot(), xml_mapping)
                annotators = annotators + [
                    MatchingAnnotator(
                        target_annotations,
                        match_detail_reporter=match_detail_reporter,
                        use_tag_begin_prefix=True)
                ]
            annotator = Annotator(annotators)
        else:
            annotator = None

        if annotator:
            # Materialize the pages: they are annotated as a whole document,
            # written out, and then evaluated again below.
            svg_roots = list(iter_svg_pages_for_lxml(lxml_root))
            annotator.annotate(SvgStructuredDocument(svg_roots))
        else:
            # Without annotation the pages are only iterated once.
            svg_roots = iter_svg_pages_for_lxml(lxml_root)
        for page_index, svg_root in enumerate(svg_roots):
            if annotator:
                svg_root = visualize_svg_annotations(svg_root)
            svg_filename = svg_filename_pattern.format(1 + page_index)
            logger.info('writing to: %s', svg_filename)
            with open(svg_filename, 'wb') as f:
                etree.ElementTree(svg_root).write(f, pretty_print=True)
        if annotator:
            tagging_evaluation_results = evaluate_document_by_page(
                SvgStructuredDocument(svg_roots))
            logger.info(
                'tagging evaluation:\n%s', '\n'.join([
                    'page{}: {}'.format(1 + i, r)
                    for i, r in enumerate(tagging_evaluation_results)
                ]))
            if args.annotation_evaluation_csv:
                write_dict_csv(
                    args.annotation_evaluation_csv,
                    DEFAULT_EVALUATION_COLUMNS,
                    to_annotation_evaluation_csv_dict_rows(
                        tagging_evaluation_results,
                        document=os.path.basename(args.lxml_path)))
    finally:
        if match_detail_reporter:
            match_detail_reporter.close()
コード例 #10
0
 def test_should_apply_match_require_next_flag(self):
     """A REQUIRE_NEXT entry of 'true' sets require_next on the annotation."""
     tag_mapping = {
         TAG1: 'title',
         TAG1 + XmlMappingSuffix.REQUIRE_NEXT: 'true'
     }
     root = E.article(E.title(SOME_VALUE))
     result = xml_root_to_target_annotations(root, {'article': tag_mapping})
     flags = [t.require_next for t in result]
     assert flags == [True]
コード例 #11
0
 def test_should_apply_match_multiple_flag(self):
     """A MATCH_MULTIPLE entry of 'true' sets match_multiple on the annotation."""
     tag_mapping = {
         TAG1: 'title',
         TAG1 + XmlMappingSuffix.MATCH_MULTIPLE: 'true'
     }
     root = E.article(E.title(SOME_VALUE))
     result = xml_root_to_target_annotations(root, {'article': tag_mapping})
     flags = [t.match_multiple for t in result]
     assert flags == [True]
コード例 #12
0
 def test_should_apply_match_bonding_flag(self):
     """A BONDING entry of 'true' sets bonding on the annotation."""
     tag_mapping = {
         TAG1: 'title',
         TAG1 + XmlMappingSuffix.BONDING: 'true'
     }
     root = E.article(E.title(SOME_VALUE))
     result = xml_root_to_target_annotations(root, {'article': tag_mapping})
     flags = [t.bonding for t in result]
     assert flags == [True]
コード例 #13
0
 def test_should_extract_single_value_if_its_the_only_value(self):
     """When the regex group equals the whole text, the value is a plain string."""
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
     }
     root = E.article(E.entry(E.value('12345')))
     result = xml_root_to_target_annotations(root, {'article': tag_mapping})
     assert len(result) == 1
     name_value_pairs = [(t.name, t.value) for t in result]
     assert name_value_pairs == [(TAG1, '12345')]
コード例 #14
0
 def test_should_use_multiple_xpaths(self):
     """Newline-separated xpaths in one mapping entry are each applied."""
     xpaths = '\n{}\n{}\n'.format('entry/child1', 'entry/child2')
     entry = E.entry(E.child1(SOME_VALUE), E.child2(SOME_VALUE_2))
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': {TAG1: xpaths}})
     expected = [(TAG1, SOME_VALUE), (TAG1, SOME_VALUE_2)]
     assert [(t.name, t.value) for t in result] == expected
コード例 #15
0
 def test_should_extract_numbers_from_value_after_text(self):
     """The value holds the full text, the text before the number and the number."""
     full_text = SOME_VALUE + ' 12345'
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
     }
     root = E.article(E.entry(E.value(full_text)))
     result = xml_root_to_target_annotations(root, {'article': tag_mapping})
     assert len(result) == 1
     expected = [(TAG1, {SOME_VALUE + ' 12345', SOME_VALUE, '12345'})]
     assert [(t.name, set(t.value)) for t in result] == expected
コード例 #16
0
 def test_should_apply_regex_to_result(self):
     """The REGEX entry strips the leading '1.1. ' numbering from the title."""
     tag_mapping = {
         TAG1: 'title',
         TAG1 + XmlMappingSuffix.REGEX: r'(?:\d+\.?)* ?(.*)'
     }
     root = E.article(E.title('1.1. ' + SOME_VALUE))
     annotations = xml_root_to_target_annotations(
         root, {'article': tag_mapping})
     assert len(annotations) == 1
     only = annotations[0]
     assert only.name == TAG1
     assert only.value == SOME_VALUE
コード例 #17
0
 def test_should_add_sub_annotations_with_multiple_values(self):
     """Each node matched by a sub xpath becomes its own sub annotation."""
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.SUB + '.value': './value'
     }
     entry = E.entry(E.value(SOME_VALUE), E.value(SOME_VALUE_2))
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': tag_mapping})
     sub_pairs = [(t.name, t.value) for t in result[0].sub_annotations]
     assert sub_pairs == [('value', SOME_VALUE), ('value', SOME_VALUE_2)]
コード例 #18
0
 def test_should_apply_children_xpaths_and_exclude_parents(self):
     """Only leaf values are returned; the matched 'parent' element adds none."""
     parent = E.parent(
         E.child2(SOME_LONGER_VALUE), E.child1(SOME_SHORTER_VALUE))
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.CHILDREN: './/*'
     }
     result = xml_root_to_target_annotations(
         E.article(E.entry(parent)), {'article': tag_mapping})
     expected = [(TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])]
     assert [(t.name, t.value) for t in result] == expected
コード例 #19
0
 def test_should_add_sub_annotations(self):
     """Each SUB mapping entry yields a sub annotation named after its key."""
     sub_prefix = TAG1 + XmlMappingSuffix.SUB
     tag_mapping = {
         TAG1: 'entry',
         sub_prefix + '.firstname': './firstname',
         sub_prefix + '.givennames': './givennames',
     }
     entry = E.entry(E.firstname(SOME_VALUE), E.givennames(SOME_VALUE_2))
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': tag_mapping})
     sub_pairs = [(t.name, t.value) for t in result[0].sub_annotations]
     assert sub_pairs == [
         ('firstname', SOME_VALUE), ('givennames', SOME_VALUE_2)]
コード例 #20
0
 def test_should_return_target_annotations_in_order_of_xml(self):
     """Annotations follow XML document order, not the order of mapping keys."""
     mapping = {'article': {TAG1: 'tag1', TAG2: 'tag2'}}
     root = E.article(
         E.tag1('tag1.1'),
         E.tag2('tag2.1'),
         E.tag1('tag1.2'),
         E.tag2('tag2.2'),
     )
     result = xml_root_to_target_annotations(root, mapping)
     expected = [
         (TAG1, 'tag1.1'),
         (TAG2, 'tag2.1'),
         (TAG1, 'tag1.2'),
         (TAG2, 'tag2.2'),
     ]
     assert [(ta.name, ta.value) for ta in result] == expected
コード例 #21
0
 def test_should_apply_multiple_children_xpaths_and_include_parent_text_if_enabled(
         self):
     """With UNMATCHED_PARENT_TEXT enabled, the entry's own text is included."""
     children_xpaths = '\n{}\n{}\n'.format('.//*', '.')
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.CHILDREN: children_xpaths,
         TAG1 + XmlMappingSuffix.UNMATCHED_PARENT_TEXT: 'true'
     }
     entry = E.entry(E.child1(SOME_SHORTER_VALUE), SOME_LONGER_VALUE)
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': tag_mapping})
     expected = [(TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])]
     assert [(t.name, t.value) for t in result] == expected
コード例 #22
0
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert LXML content to annotated, visualized SVG page roots.

    The target annotations are extracted from ``xml_content`` using
    ``xml_mapping`` and matched against the SVG pages generated from
    ``lxml_content``. Each processing step is timed and the timings are
    logged together with ``name`` and the input sizes.

    Returns:
        A list with the result of ``visualize_svg_annotations`` for each page.
    """
    timings = StopWatchRecorder()

    timings.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # parse the xml leniently (with recovery), as xml errors are not uncommon
    timings.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    timings.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(xml_root, xml_mapping)
    timings.stop()

    matching_annotator = MatchingAnnotator(
        target_annotations, use_tag_begin_prefix=True)
    annotator = Annotator(DEFAULT_ANNOTATORS + [matching_annotator])

    timings.start('convert to svg')
    svg_pages = list(iter_svg_pages_for_lxml(lxml_root))

    timings.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(svg_pages))

    timings.start('add visualisation')
    visualized_pages = [visualize_svg_annotations(page) for page in svg_pages]
    timings.stop()

    lxml_size = format(len(lxml_content), ',')
    xml_size = format(len(xml_content), ',')
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, lxml_size, xml_size, timings, align_native_enabled
    )

    return visualized_pages
コード例 #23
0
 def test_should_unnest_extract_value_from_children(self):
     """Extracted values of all children end up in one flat value collection."""
     first_text = SOME_VALUE + ' 12345'
     second_text = SOME_VALUE_2 + ' 54321'
     tag_mapping = {
         TAG1: 'entry',
         TAG1 + XmlMappingSuffix.CHILDREN: r'.//*',
         TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
     }
     entry = E.entry(E.value(first_text), E.value(second_text))
     result = xml_root_to_target_annotations(
         E.article(entry), {'article': tag_mapping})
     assert len(result) == 1
     expected_values = {
         SOME_VALUE + ' 12345', SOME_VALUE, '12345',
         SOME_VALUE_2 + ' 54321', SOME_VALUE_2, '54321'
     }
     assert [(t.name, set(t.value)) for t in result] == [
         (TAG1, expected_values)]
コード例 #24
0
 def test_should_not_apply_match_require_next_flag_if_not_set(self):
     """Without a REQUIRE_NEXT mapping entry, require_next is False."""
     mapping = {'article': {TAG1: 'title'}}
     root = E.article(E.title(SOME_VALUE))
     result = xml_root_to_target_annotations(root, mapping)
     flags = [t.require_next for t in result]
     assert flags == [False]
コード例 #25
0
 def test_should_return_empty_target_annotations_for_empty_xml(self):
     """An article element without children yields no target annotations."""
     mapping = {'article': {'title': 'title'}}
     root = E.article()
     result = xml_root_to_target_annotations(root, mapping)
     assert result == []