コード例 #1
0
    def test_should_always_preserve_specified_existing_tag_when_merging_front(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check ``always-preserve-fields=page`` against a note/page/note TEI.

        The raw TEI contains note elements holding ``TITLE_1`` and
        ``ABSTRACT_1`` with a page element holding ``TOKEN_2`` between them;
        the target XML carries the same title and abstract. After running
        ``main`` the page text must be found under ``//text/page`` and the
        ``//text/front`` texts must be ``[TITLE_1, ABSTRACT_1]``.
        """
        raw_tei_bytes = etree.tostring(
            get_segmentation_tei_node([
                E.note(TITLE_1),
                E.lb(),
                E.page(TOKEN_2),
                E.lb(),
                E.note(ABSTRACT_1),
                E.lb(),
            ]))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(
                title=TITLE_1,
                abstract_node=E.abstract(E.p(ABSTRACT_1))))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        run_options = {
            **test_helper.main_args_dict,
            'always-preserve-fields': 'page',
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        page_texts = get_xpath_text_list(annotated_root, '//text/page')
        assert page_texts == [TOKEN_2]
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == [TITLE_1, ABSTRACT_1]
コード例 #2
0
    def test_should_not_merge_front_interrupted_by_body_tag(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check front/body output when body text sits between title and abstract.

        The raw TEI text consists of the lines ``TITLE_1``, 'After title',
        ``TEXT_1``, 'Before abstract' and ``ABSTRACT_1``; the target XML has
        the title, the abstract and one body section paragraph ``TEXT_1``.
        The test asserts two separate ``//text/front`` texts (title and
        abstract) and a single ``//text/body`` text made of the three middle
        lines.
        """
        middle_lines = ['After title', TEXT_1, 'Before abstract']
        raw_text = '\n'.join([TITLE_1, *middle_lines, ABSTRACT_1])
        raw_tei_bytes = etree.tostring(
            get_segmentation_tei_node(get_tei_nodes_for_text(raw_text)))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(
                title=TITLE_1,
                abstract_node=E.abstract(E.p(ABSTRACT_1)),
                body_nodes=[E.sec(E.p(TEXT_1))]))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        field_names = ['title', 'abstract', 'body_section_paragraph']
        run_options = {
            **test_helper.main_args_dict,
            'fields': ','.join(field_names),
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == [TITLE_1, ABSTRACT_1]
        body_texts = get_xpath_text_list(annotated_root, '//text/body')
        assert body_texts == ['\n'.join(middle_lines)]
コード例 #3
0
    def test_should_auto_annotate_body_and_back_list_item_section_paragraphs(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check list-item paragraphs from body and back target nodes.

        The target XML has a ``list``/``list-item``/``p`` structure holding
        ``TEXT_1`` in the body nodes and ``TEXT_2`` in the back nodes; the raw
        TEI text is the text of both, joined by a newline. With the body and
        back section paragraph fields enabled, the test asserts ``TEXT_1``
        under ``//body`` and ``TEXT_2`` under ``//div[@type="annex"]``.
        """
        body_nodes = [E.list(E('list-item', E.p(TEXT_1)))]
        back_nodes = [E.list(E('list-item', E.p(TEXT_2)))]
        raw_text = '\n'.join(
            get_nodes_text(nodes) for nodes in (body_nodes, back_nodes))

        raw_tei_bytes = etree.tostring(
            get_training_tei_node(get_tei_nodes_for_text(raw_text)))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(body_nodes=body_nodes, back_nodes=back_nodes))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        field_names = ['body_section_paragraph', 'back_section_paragraph']
        run_options = {
            **test_helper.main_args_dict,
            'fields': ','.join(field_names),
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        body_texts = get_xpath_text_list(annotated_root, '//body')
        assert body_texts == [TEXT_1]
        annex_texts = get_xpath_text_list(
            annotated_root, '//div[@type="annex"]')
        assert annex_texts == [TEXT_2]
コード例 #4
0
    def test_should_auto_annotate_body_and_back_section(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check titled sections from body and back target nodes.

        Body and back each contain one ``sec`` with a title and a paragraph.
        The raw TEI text is the text of both sections joined by a newline.
        With the body/back section title and paragraph fields enabled, the
        test asserts the body section text under ``//body`` and the back
        section text under ``//div[@type="annex"]``.
        """
        body_nodes = [E.sec(E.title(SECTION_TITLE_1), '\n', E.p(TEXT_1))]
        back_nodes = [E.sec(E.title(SECTION_TITLE_2), '\n', E.p(TEXT_2))]
        body_tei_text = get_nodes_text(body_nodes)
        back_tei_text = get_nodes_text(back_nodes)
        tei_text = '\n'.join([body_tei_text, back_tei_text])
        LOGGER.debug('tei_text: %s', tei_text)

        raw_tei_bytes = etree.tostring(
            get_training_tei_node(get_tei_nodes_for_text(tei_text)))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(body_nodes=body_nodes, back_nodes=back_nodes))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        field_names = [
            'body_section_title',
            'body_section_paragraph',
            'back_section_title',
            'back_section_paragraph',
        ]
        run_options = {
            **test_helper.main_args_dict,
            'fields': ','.join(field_names),
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        body_texts = get_xpath_text_list(annotated_root, '//body')
        assert body_texts == [body_tei_text]
        annex_texts = get_xpath_text_list(
            annotated_root, '//div[@type="annex"]')
        assert annex_texts == [back_tei_text]
コード例 #5
0
    def test_should_not_preserve_exclude_existing_tag_and_use_body_by_default(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check ``--no-preserve-fields=page`` on a TEI with only a page tag.

        The raw TEI holds a single page element with ``TOKEN_1`` and the
        target XML is an article with an empty front. The test asserts that
        no ``//text/page`` text remains and that ``TOKEN_1`` is found under
        ``//text/body``.
        """
        raw_tei_root = get_segmentation_tei_node([E.page(TOKEN_1)])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

        target_xml_root = E.article(E.front())
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_root))

        argv = [*test_helper.main_args, '--no-preserve-fields=page']
        main(argv, save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        page_texts = get_xpath_text_list(annotated_root, '//text/page')
        assert page_texts == []
        body_texts = get_xpath_text_list(annotated_root, '//text/body')
        assert body_texts == [TOKEN_1]
コード例 #6
0
    def test_should_auto_annotate_and_not_merge_multiple_references(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that two references yield two separate ``listBibl`` texts.

        The raw TEI lines are ``TOKEN_1`` followed by two "label text"
        reference lines; the target XML lists the two matching JATS
        references. ``main`` runs with ``no-preserve-tags``,
        ``no-merge-references``, the ``reference`` field and the
        ``reference.use-raw-text=true`` mapping override. The test asserts
        that ``//text/listBibl`` returns each reference line as its own
        entry.
        """
        reference_line_1 = ' '.join([LABEL_1, REFERENCE_TEXT_1])
        reference_line_2 = ' '.join([LABEL_2, REFERENCE_TEXT_2])

        raw_tei_bytes = etree.tostring(
            get_segmentation_tei_node(
                get_tei_nodes_for_lines(
                    [TOKEN_1, reference_line_1, reference_line_2])))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        reference_nodes = [
            get_jats_reference_node(LABEL_1, REFERENCE_TEXT_1),
            get_jats_reference_node(LABEL_2, REFERENCE_TEXT_2),
        ]
        target_xml_bytes = etree.tostring(
            get_target_xml_node(reference_nodes=reference_nodes))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        run_options = {
            **test_helper.main_args_dict,
            'no-preserve-tags': True,
            'no-merge-references': True,
            'fields': 'reference',
            'xml-mapping-overrides': 'reference.use-raw-text=true',
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        reference_texts = get_xpath_text_list(
            annotated_root, '//text/listBibl')
        assert reference_texts == [reference_line_1, reference_line_2]
コード例 #7
0
    def test_should_always_preserve_reference_tag(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check ``always-preserve-fields=reference`` keeps ``listBibl`` text.

        The raw TEI has a note with ``TOKEN_1`` and a ``listBibl`` whose text
        starts with ``TOKEN_2 TOKEN_3``; the target title is
        ``TOKEN_1 TOKEN_2 TOKEN_3``, so it shares tokens with the reference
        text. ``main`` runs with ``no-preserve-tags`` plus
        ``always-preserve-fields=reference`` and the test asserts the full
        reference text is still returned for ``//text/listBibl``.
        """
        shared_tokens = [TOKEN_2, TOKEN_3]
        reference_text = ' '.join(shared_tokens) + ' this is a reference 1'

        raw_tei_bytes = etree.tostring(
            get_segmentation_tei_node([
                E.note(TOKEN_1),
                E.lb(),
                E.listBibl(reference_text),
                E.lb(),
            ]))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_title = ' '.join([TOKEN_1, *shared_tokens])
        target_xml_bytes = etree.tostring(
            get_target_xml_node(title=target_title))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        run_options = {
            **test_helper.main_args_dict,
            'no-preserve-tags': True,
            'always-preserve-fields': 'reference',
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        reference_texts = get_xpath_text_list(
            annotated_root, '//text/listBibl')
        assert reference_texts == [reference_text]
コード例 #8
0
    def test_should_auto_annotate_body_and_back_table_label_title_caption_as_body(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that body and back ``table-wrap`` text is all found in body.

        Body and back each contain one ``table-wrap`` with a label and a
        caption (title plus paragraph). The raw TEI text is the text of both,
        joined by a newline. With only the ``table`` field enabled, the test
        asserts that ``//body`` returns that whole combined text as a single
        entry.
        """
        body_nodes = [
            E('table-wrap',
              E.label(FIGURE_LABEL_1),
              ' ',
              E.caption(E.title(SECTION_TITLE_1), '\n', E.p(TEXT_1)))
        ]
        back_nodes = [
            E('table-wrap',
              E.label(FIGURE_LABEL_2),
              ' ',
              E.caption(E.title(SECTION_TITLE_2), '\n', E.p(TEXT_2)))
        ]
        body_tei_text = get_nodes_text(body_nodes)
        back_tei_text = get_nodes_text(back_nodes)
        tei_text = '\n'.join([body_tei_text, back_tei_text])

        raw_tei_bytes = etree.tostring(
            get_training_tei_node(get_tei_nodes_for_text(tei_text)))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(body_nodes=body_nodes, back_nodes=back_nodes))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        run_options = {**test_helper.main_args_dict, 'fields': 'table'}
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        body_texts = get_xpath_text_list(annotated_root, '//body')
        assert body_texts == [tei_text]
コード例 #9
0
    def test_should_auto_annotate_appendix_section_as_annex(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that an ``app-group`` in the back nodes ends up in the annex.

        The back nodes hold an ``app-group`` with the title 'Appendix' and one
        ``app`` (section title plus paragraph); the raw TEI text is the text
        of that group. With the section and appendix fields enabled, the test
        asserts that ``//div[@type="annex"]`` returns the full group text.
        """
        appendix_group = E(
            'app-group',
            E.title('Appendix'),
            '\n',
            E.app(E.title(SECTION_TITLE_2), '\n', E.p(TEXT_2)))
        back_nodes = [appendix_group]
        tei_text = get_nodes_text(back_nodes)

        raw_tei_bytes = etree.tostring(
            get_training_tei_node(get_tei_nodes_for_text(tei_text)))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(back_nodes=back_nodes))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        field_names = [
            'body_section_title',
            'body_section_paragraph',
            'back_section_title',
            'back_section_paragraph',
            'appendix_group_title',
            'appendix',
        ]
        run_options = {
            **test_helper.main_args_dict,
            'fields': ','.join(field_names),
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        annex_texts = get_xpath_text_list(
            annotated_root, '//div[@type="annex"]')
        assert annex_texts == [tei_text]
コード例 #10
0
    def test_should_not_preserve_existing_front_body_tag_front_and_use_headnote_for_repeated_text(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check headnote/front/body output with ``no-preserve-fields=front,body``.

        The raw TEI has a front and a body element that both start with the
        line 'Page header'. The front continues with 'Before title',
        ``TITLE_1`` and 'After title'; the body continues with
        'Before paragraph', ``TEXT_1`` and 'After paragraph'. The test
        asserts:

        * both 'Page header' lines are returned for
          ``//text/note[@place="headnote"]``;
        * ``//text/front`` returns 'Before title' plus the title;
        * ``//text/body`` returns 'After title' and the paragraph block as
          two entries.
        """
        front_lines = ['Page header', 'Before title', TITLE_1, 'After title']
        body_lines = [
            'Page header',
            'Before paragraph',
            TEXT_1,
            'After paragraph',
        ]
        raw_tei_bytes = etree.tostring(
            get_segmentation_tei_node([
                E.front(*get_tei_nodes_for_lines(front_lines)),
                E.body(*get_tei_nodes_for_lines(body_lines)),
            ]))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(
                title=TITLE_1, body_nodes=[E.sec(E.p(TEXT_1))]))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        field_names = ['title', 'abstract', 'body_section_paragraph']
        run_options = {
            **test_helper.main_args_dict,
            'no-preserve-fields': 'front,body',
            'fields': ','.join(field_names),
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        headnote_texts = get_xpath_text_list(
            annotated_root, '//text/note[@place="headnote"]')
        assert headnote_texts == ['Page header', 'Page header']
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == ['\n'.join(['Before title', TITLE_1])]
        body_texts = get_xpath_text_list(annotated_root, '//text/body')
        assert body_texts == [
            'After title',
            '\n'.join(['Before paragraph', TEXT_1, 'After paragraph']),
        ]
コード例 #11
0
    def test_should_preserve_page_numbers_while_detecting_headnote(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check page tags are kept while repeated headers become headnotes.

        The raw TEI alternates front/page/front and body/page/body elements:
        a 'Page header' line, a '123' page element, then the title (front) or
        the paragraph text (body). ``main`` runs with
        ``always-preserve-fields=page``. The test asserts that both '123'
        page texts remain under ``//text/page``, that both 'Page header'
        lines are returned as headnotes, and that front and body return only
        ``TITLE_1`` and ``TEXT_1`` respectively.
        """
        raw_tei_children = [
            E.front('Page header', E.lb()),
            '\n',
            E.page('123', E.lb()),
            '\n',
            E.front(TITLE_1, E.lb()),
            '\n',
            E.body('Page header', E.lb()),
            '\n',
            E.page('123', E.lb()),
            '\n',
            E.body(TEXT_1, E.lb()),
            '\n',
        ]
        raw_tei_bytes = etree.tostring(
            get_segmentation_tei_node(raw_tei_children))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(
            get_target_xml_node(
                title=TITLE_1, body_nodes=[E.sec(E.p(TEXT_1))]))
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        field_names = ['title', 'abstract', 'body_section_paragraph']
        run_options = {
            **test_helper.main_args_dict,
            'always-preserve-fields': 'page',
            'fields': ','.join(field_names),
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        page_texts = get_xpath_text_list(annotated_root, '//text/page')
        assert page_texts == ['123', '123']
        headnote_texts = get_xpath_text_list(
            annotated_root, '//text/note[@place="headnote"]')
        assert headnote_texts == ['Page header', 'Page header']
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == [TITLE_1]
        body_texts = get_xpath_text_list(annotated_root, '//text/body')
        assert body_texts == [TEXT_1]
コード例 #12
0
    def test_should_merge_front_tags_and_include_preceeding_text(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that ``//front`` spans from 'Before' through the abstract.

        The raw TEI text is the lines 'Before', ``TITLE_1``, 'Other',
        ``ABSTRACT_1`` and 'After'; the target XML carries the title and the
        abstract. With the ``title`` and ``abstract`` fields enabled, the
        test asserts a single ``//front`` text covering the first four lines
        (everything except 'After').
        """
        front_lines = ['Before', TITLE_1, 'Other', ABSTRACT_1]
        front_tei_text = '\n'.join(front_lines)
        tei_text = '\n'.join([front_tei_text, 'After'])

        raw_tei_bytes = etree.tostring(
            get_training_tei_node(get_tei_nodes_for_text(tei_text)))
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_root = get_target_xml_node(
            title=TITLE_1, abstract_node=E.abstract(E.p(ABSTRACT_1)))
        test_helper.write_xml_root(target_xml_root)

        field_names = ['title', 'abstract']
        run_options = {
            **test_helper.main_args_dict,
            'fields': ','.join(field_names),
        }
        main(dict_to_args(run_options), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        front_texts = get_xpath_text_list(annotated_root, '//front')
        assert front_texts == [front_tei_text]