Ejemplo n.º 1
0
    def test_should_not_merge_front_interrupted_by_body_tag(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that front text split by matched body text is not merged.

        The raw TEI contains title, filler, body text, filler and abstract
        (in that order). The output is expected to list title and abstract
        as two separate front entries and everything in between as one
        body entry.
        """
        raw_lines = [
            TITLE_1, 'After title', TEXT_1, 'Before abstract', ABSTRACT_1
        ]
        raw_tei_node = get_segmentation_tei_node(
            get_tei_nodes_for_text('\n'.join(raw_lines)))
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            title=TITLE_1,
            abstract_node=E.abstract(E.p(ABSTRACT_1)),
            body_nodes=[E.sec(E.p(TEXT_1))])
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'fields': ','.join(['title', 'abstract', 'body_section_paragraph'])
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == [TITLE_1, ABSTRACT_1]
        body_texts = get_xpath_text_list(annotated_root, '//text/body')
        assert body_texts == [
            '\n'.join(['After title', TEXT_1, 'Before abstract'])
        ]
Ejemplo n.º 2
0
    def test_should_always_preserve_specified_existing_tag_when_merging_front(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a ``page`` tag between title and abstract is kept.

        Runs with ``always-preserve-fields`` set to ``page`` and expects
        the page token to stay in its own element while title and abstract
        are reported as separate front entries.
        """
        raw_tei_node = get_segmentation_tei_node([
            E.note(TITLE_1),
            E.lb(),
            E.page(TOKEN_2),
            E.lb(),
            E.note(ABSTRACT_1),
            E.lb()
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            title=TITLE_1,
            abstract_node=E.abstract(E.p(ABSTRACT_1)))
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'always-preserve-fields': 'page'
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        page_texts = get_xpath_text_list(annotated_root, '//text/page')
        assert page_texts == [TOKEN_2]
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == [TITLE_1, ABSTRACT_1]
Ejemplo n.º 3
0
    def test_should_auto_annotate_and_not_merge_multiple_references(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that two references end up as two ``listBibl`` entries.

        Runs with ``no-preserve-tags`` and ``no-merge-references`` enabled,
        only the ``reference`` field selected and the raw-text mapping
        override for references.
        """
        # Each raw line is the label followed by the reference text.
        reference_line_1 = LABEL_1 + ' ' + REFERENCE_TEXT_1
        reference_line_2 = LABEL_2 + ' ' + REFERENCE_TEXT_2

        raw_tei_node = get_segmentation_tei_node(
            get_tei_nodes_for_lines(
                [TOKEN_1, reference_line_1, reference_line_2]))
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        reference_nodes = [
            get_jats_reference_node(LABEL_1, REFERENCE_TEXT_1),
            get_jats_reference_node(LABEL_2, REFERENCE_TEXT_2)
        ]
        target_xml_node = get_target_xml_node(reference_nodes=reference_nodes)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'no-preserve-tags': True,
            'no-merge-references': True,
            'fields': 'reference',
            'xml-mapping-overrides': 'reference.use-raw-text=true'
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        reference_texts = get_xpath_text_list(
            annotated_root, '//text/listBibl')
        assert reference_texts == [reference_line_1, reference_line_2]
Ejemplo n.º 4
0
    def test_should_always_preserve_reference_tag(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that an existing ``listBibl`` survives a competing title.

        The target title shares tokens with the reference text. With
        ``always-preserve-fields`` set to ``reference`` the whole reference
        text is still expected inside ``listBibl``.
        """
        shared_tokens = [TOKEN_2, TOKEN_3]
        reference_text = ' '.join(shared_tokens) + ' this is a reference 1'

        raw_tei_node = get_segmentation_tei_node([
            E.note(TOKEN_1),
            E.lb(),
            E.listBibl(reference_text),
            E.lb()
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        # The title deliberately overlaps with the start of the reference.
        target_title = ' '.join([TOKEN_1, *shared_tokens])
        target_xml_node = get_target_xml_node(title=target_title)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'no-preserve-tags': True,
            'always-preserve-fields': 'reference'
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        reference_texts = get_xpath_text_list(
            annotated_root, '//text/listBibl')
        assert reference_texts == [reference_text]
Ejemplo n.º 5
0
    def test_should_auto_annotate_body_and_back_table_label_title_caption_as_body(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that table text from body and back is annotated as body.

        Both the body and the back of the target XML contain a
        ``table-wrap`` with label, caption title and caption paragraph.
        The combined text is expected as a single ``body`` entry.
        """
        body_table_node = E(
            'table-wrap',
            E.label(FIGURE_LABEL_1), ' ',
            E.caption(E.title(SECTION_TITLE_1), '\n', E.p(TEXT_1)))
        back_table_node = E(
            'table-wrap',
            E.label(FIGURE_LABEL_2), ' ',
            E.caption(E.title(SECTION_TITLE_2), '\n', E.p(TEXT_2)))
        target_body_content_nodes = [body_table_node]
        target_back_content_nodes = [back_table_node]

        tei_text = '\n'.join([
            get_nodes_text(target_body_content_nodes),
            get_nodes_text(target_back_content_nodes)
        ])
        raw_tei_node = get_training_tei_node(get_tei_nodes_for_text(tei_text))
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            body_nodes=target_body_content_nodes,
            back_nodes=target_back_content_nodes)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'fields': ','.join(['table'])
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        body_texts = get_xpath_text_list(annotated_root, '//body')
        assert body_texts == [tei_text]
Ejemplo n.º 6
0
    def test_should_auto_annotate_appendix_section_as_annex(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that an ``app-group`` in the back becomes an annex div.

        The target back matter holds an appendix group with a group title
        and one appendix (title plus paragraph). Its full text is expected
        as the single ``div[@type="annex"]`` entry.
        """
        appendix_group_node = E(
            'app-group',
            E.title('Appendix'), '\n',
            E.app(E.title(SECTION_TITLE_2), '\n', E.p(TEXT_2)))
        target_back_content_nodes = [appendix_group_node]

        tei_text = get_nodes_text(target_back_content_nodes)
        raw_tei_node = get_training_tei_node(get_tei_nodes_for_text(tei_text))
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            back_nodes=target_back_content_nodes)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        selected_fields = [
            'body_section_title', 'body_section_paragraph',
            'back_section_title', 'back_section_paragraph',
            'appendix_group_title', 'appendix'
        ]
        args_dict = {
            **test_helper.main_args_dict,
            'fields': ','.join(selected_fields)
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        annex_texts = get_xpath_text_list(
            annotated_root, '//div[@type="annex"]')
        assert annex_texts == [tei_text]
Ejemplo n.º 7
0
    def test_should_auto_annotate_body_and_back_list_item_section_paragraphs(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that list item paragraphs are annotated in body and back.

        Body and back of the target XML each hold a list with one item
        paragraph. The body paragraph is expected under ``body`` and the
        back paragraph under ``div[@type="annex"]``.
        """
        target_body_content_nodes = [E.list(E('list-item', E.p(TEXT_1)))]
        target_back_content_nodes = [E.list(E('list-item', E.p(TEXT_2)))]

        body_list_text = get_nodes_text(target_body_content_nodes)
        back_list_text = get_nodes_text(target_back_content_nodes)
        tei_text = '\n'.join([body_list_text, back_list_text])
        raw_tei_node = get_training_tei_node(get_tei_nodes_for_text(tei_text))
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            body_nodes=target_body_content_nodes,
            back_nodes=target_back_content_nodes)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'fields': ','.join(
                ['body_section_paragraph', 'back_section_paragraph'])
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        body_texts = get_xpath_text_list(annotated_root, '//body')
        assert body_texts == [TEXT_1]
        annex_texts = get_xpath_text_list(
            annotated_root, '//div[@type="annex"]')
        assert annex_texts == [TEXT_2]
Ejemplo n.º 8
0
    def test_should_auto_annotate_body_and_back_section(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that body and back sections are annotated separately.

        Body and back of the target XML each hold one section (title plus
        paragraph). The body section text is expected under ``body`` and
        the back section text under ``div[@type="annex"]``.
        """
        body_section_node = E.sec(
            E.title(SECTION_TITLE_1), '\n', E.p(TEXT_1))
        back_section_node = E.sec(
            E.title(SECTION_TITLE_2), '\n', E.p(TEXT_2))
        target_body_content_nodes = [body_section_node]
        target_back_content_nodes = [back_section_node]

        body_tei_text = get_nodes_text(target_body_content_nodes)
        back_tei_text = get_nodes_text(target_back_content_nodes)
        tei_text = '\n'.join([body_tei_text, back_tei_text])
        LOGGER.debug('tei_text: %s', tei_text)

        raw_tei_node = get_training_tei_node(get_tei_nodes_for_text(tei_text))
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            body_nodes=target_body_content_nodes,
            back_nodes=target_back_content_nodes)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        selected_fields = [
            'body_section_title', 'body_section_paragraph',
            'back_section_title', 'back_section_paragraph'
        ]
        args_dict = {
            **test_helper.main_args_dict,
            'fields': ','.join(selected_fields)
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        body_texts = get_xpath_text_list(annotated_root, '//body')
        assert body_texts == [body_tei_text]
        annex_texts = get_xpath_text_list(
            annotated_root, '//div[@type="annex"]')
        assert annex_texts == [back_tei_text]
Ejemplo n.º 9
0
    def test_should_preserve_existing_tag(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that an existing ``page`` tag is kept with default args.

        The target XML is an article with an empty front, so there is
        nothing to match against the page token.
        """
        raw_tei_node = get_segmentation_tei_node([E.page(TOKEN_1)])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = E.article(E.front())
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        main(list(test_helper.main_args), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        page_texts = get_xpath_text_list(annotated_root, '//text/page')
        assert page_texts == [TOKEN_1]
Ejemplo n.º 10
0
    def test_should_auto_annotate_title_as_front(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a note matching the target title is tagged as front."""
        raw_tei_node = get_segmentation_tei_node([E.note(TOKEN_1)])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(title=TOKEN_1)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        main(list(test_helper.main_args), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == [TOKEN_1]
Ejemplo n.º 11
0
    def test_should_process_specific_file(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a single file given via ``source-path`` is processed.

        ``source-base-path`` is overridden with ``None`` and the raw TEI
        file path is passed as ``source-path`` instead.
        """
        raw_tei_bytes = etree.tostring(get_default_tei_node())
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(get_default_target_xml_node())
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        args_dict = {
            **test_helper.main_args_dict,
            'source-base-path': None,
            'source-path': str(test_helper.tei_raw_file_path)
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        assert annotated_root is not None
Ejemplo n.º 12
0
 def test_should_run_locally_without_beam_if_workers_more_than_one(
         self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
     """Check that running with ``num_workers`` set to 2 produces output.

     An output file with placeholder content is written before the run;
     afterwards the TEI root returned by the helper must not be ``None``.
     """
     raw_tei_bytes = etree.tostring(get_default_tei_node())
     test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

     target_xml_bytes = etree.tostring(get_default_target_xml_node())
     test_helper.xml_file_path.write_bytes(target_xml_bytes)

     # Pre-existing output; presumably overwritten as resume is not set.
     test_helper.tei_auto_file_path.parent.mkdir()
     test_helper.tei_auto_file_path.write_bytes(b'existing')

     args_dict = {**test_helper.main_args_dict, 'num_workers': 2}
     main(dict_to_args(args_dict), save_main_session=False)

     annotated_root = test_helper.get_tei_auto_root()
     assert annotated_root is not None
Ejemplo n.º 13
0
    def test_should_skip_existing_output_file_if_resume_is_enabled(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that ``resume`` leaves an existing output file untouched.

        The output file is pre-populated with placeholder bytes, which
        must still be its content after the run.
        """
        raw_tei_bytes = etree.tostring(get_default_tei_node())
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(get_default_target_xml_node())
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        test_helper.tei_auto_file_path.parent.mkdir()
        test_helper.tei_auto_file_path.write_bytes(b'existing')

        args_dict = {**test_helper.main_args_dict, 'resume': True}
        main(dict_to_args(args_dict), save_main_session=False)

        output_bytes = test_helper.tei_auto_file_path.read_bytes()
        assert output_bytes == b'existing'
Ejemplo n.º 14
0
    def test_should_write_debug_match(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
            temp_dir: Path):
        """Check that the ``debug-match`` file is created.

        Runs with the complex matcher and a ``debug.csv`` path inside the
        temporary directory, then asserts that this file exists.
        """
        raw_tei_bytes = etree.tostring(get_default_tei_node())
        test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)

        target_xml_bytes = etree.tostring(get_default_target_xml_node())
        test_helper.xml_file_path.write_bytes(target_xml_bytes)

        debug_match_path = temp_dir.joinpath('debug.csv')
        args_dict = {
            **test_helper.main_args_dict,
            'matcher': MatcherNames.COMPLEX,
            'debug-match': str(debug_match_path)
        }
        main(dict_to_args(args_dict), save_main_session=False)

        assert debug_match_path.exists()
Ejemplo n.º 15
0
    def test_should_merge_front_tags_and_include_preceeding_text(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that front is merged and includes the text before it.

        The raw TEI contains text before the title, between title and
        abstract, and after the abstract. Everything up to and including
        the abstract is expected as a single ``front`` entry.
        """
        front_lines = ['Before', TITLE_1, 'Other', ABSTRACT_1]
        front_tei_text = '\n'.join(front_lines)
        tei_text = '\n'.join([front_tei_text, 'After'])

        raw_tei_node = get_training_tei_node(get_tei_nodes_for_text(tei_text))
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            title=TITLE_1,
            abstract_node=E.abstract(E.p(ABSTRACT_1)))
        test_helper.write_xml_root(target_xml_node)

        args_dict = {
            **test_helper.main_args_dict,
            'fields': ','.join(['title', 'abstract'])
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        front_texts = get_xpath_text_list(annotated_root, '//front')
        assert front_texts == [front_tei_text]
Ejemplo n.º 16
0
    def test_should_filter_out_xml_if_selected_fields_are_not_matching(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
            actual_abstract: str, expected_abstract: str, expected_match: bool,
            required_fields: str, relative_failed_output_path: str,
            temp_dir: Path):
        """Check output filtering based on matching and required fields.

        The extra arguments are presumably supplied by a parametrization
        that is not visible here. When ``expected_match`` is true the
        output file must exist. Otherwise it must not exist, and if a
        failed output path was configured a file with the same name must
        exist under that path.
        """
        raw_tei_node = get_segmentation_tei_node([
            E.note(TITLE_1),
            E.lb(),
            E.note(ABSTRACT_PREFIX_1, E.lb(), actual_abstract)
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        # An empty expected abstract means the target has no abstract node.
        if expected_abstract:
            abstract_node = E.abstract(E.p(expected_abstract))
        else:
            abstract_node = None
        target_xml_node = get_target_xml_node(
            title=TITLE_1, abstract_node=abstract_node)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        if relative_failed_output_path:
            failed_output_path = str(temp_dir / relative_failed_output_path)
        else:
            failed_output_path = ''

        args_dict = {
            **test_helper.main_args_dict,
            'fields': ','.join(['title', 'author', 'author_aff', 'abstract']),
            'require-matching-fields': ','.join(['abstract']),
            'required-fields': required_fields,
            'failed-output-path': failed_output_path,
            'matcher': 'simple'
        }
        main(dict_to_args(args_dict), save_main_session=False)

        if expected_match:
            assert test_helper.tei_auto_file_path.exists()
            return

        assert not test_helper.tei_auto_file_path.exists()
        if failed_output_path:
            failed_file_path = (Path(failed_output_path) /
                                test_helper.tei_auto_file_path.name)
            assert failed_file_path.exists()
Ejemplo n.º 17
0
    def test_should_not_preserve_existing_front_body_tag_front_and_use_headnote_for_repeated_text(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check re-annotation of front/body and headnotes for repeated text.

        The raw TEI has a ``front`` and a ``body`` element that both start
        with the same 'Page header' line. With ``no-preserve-fields`` set
        to ``front,body`` the repeated line is expected as two headnotes,
        the front is expected to end at the title, and the text after the
        title is expected as its own body entry.
        """
        front_lines = ['Page header', 'Before title', TITLE_1, 'After title']
        body_lines = [
            'Page header',
            'Before paragraph',
            TEXT_1,
            'After paragraph',
        ]
        raw_tei_node = get_segmentation_tei_node([
            E.front(*get_tei_nodes_for_lines(front_lines)),
            E.body(*get_tei_nodes_for_lines(body_lines))
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            title=TITLE_1,
            body_nodes=[E.sec(E.p(TEXT_1))])
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'no-preserve-fields': 'front,body',
            'fields': ','.join(['title', 'abstract', 'body_section_paragraph'])
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        headnote_texts = get_xpath_text_list(
            annotated_root, '//text/note[@place="headnote"]')
        assert headnote_texts == ['Page header', 'Page header']
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == ['\n'.join(['Before title', TITLE_1])]
        body_texts = get_xpath_text_list(annotated_root, '//text/body')
        assert body_texts == [
            'After title',
            '\n'.join(['Before paragraph', TEXT_1, 'After paragraph'])
        ]
Ejemplo n.º 18
0
    def test_should_preserve_page_numbers_while_detecting_headnote(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that page tags are kept while headnotes are detected.

        The raw TEI repeats a 'Page header' line and a '123' page element,
        once before the title and once before the body text. With
        ``always-preserve-fields`` set to ``page`` both page entries are
        expected to remain, and both header lines are expected as
        headnotes.
        """
        raw_tei_children = [
            E.front('Page header', E.lb()),
            '\n',
            E.page('123', E.lb()),
            '\n',
            E.front(TITLE_1, E.lb()),
            '\n',
            E.body('Page header', E.lb()),
            '\n',
            E.page('123', E.lb()),
            '\n',
            E.body(TEXT_1, E.lb()),
            '\n',
        ]
        raw_tei_node = get_segmentation_tei_node(raw_tei_children)
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_xml_node = get_target_xml_node(
            title=TITLE_1,
            body_nodes=[E.sec(E.p(TEXT_1))])
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'always-preserve-fields': 'page',
            'fields': ','.join(['title', 'abstract', 'body_section_paragraph'])
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        page_texts = get_xpath_text_list(annotated_root, '//text/page')
        assert page_texts == ['123', '123']
        headnote_texts = get_xpath_text_list(
            annotated_root, '//text/note[@place="headnote"]')
        assert headnote_texts == ['Page header', 'Page header']
        front_texts = get_xpath_text_list(annotated_root, '//text/front')
        assert front_texts == [TITLE_1]
        body_texts = get_xpath_text_list(annotated_root, '//text/body')
        assert body_texts == [TEXT_1]
Ejemplo n.º 19
0
    def test_should_always_preserve_specified_existing_tag(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a ``page`` tag is kept even if the title spans it.

        The target title is made of three tokens, the middle one of which
        sits in a ``page`` element in the raw TEI. With
        ``always-preserve-fields`` set to ``page`` that token is expected
        to remain the only ``page`` entry.
        """
        raw_tei_node = get_segmentation_tei_node([
            E.note(TOKEN_1),
            E.lb(),
            E.page(TOKEN_2),
            E.lb(),
            E.note(TOKEN_3),
            E.lb()
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_title = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
        target_xml_node = get_target_xml_node(title=target_title)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

        args_dict = {
            **test_helper.main_args_dict,
            'always-preserve-fields': 'page'
        }
        main(dict_to_args(args_dict), save_main_session=False)

        annotated_root = test_helper.get_tei_auto_root()
        page_texts = get_xpath_text_list(annotated_root, '//text/page')
        assert page_texts == [TOKEN_2]