def test_should_replace_affiliation_with_author_if_single_tokens(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check author/affiliation annotation when both contain the token '1'.

        The raw TEI header consists of an author note and an affiliation
        note, each followed by a line break. The target XML declares two
        contributors and one labelled affiliation. After running ``main``
        with the ``simple`` matcher, ``//byline/docAuthor`` and
        ``//byline/affiliation`` must equal the respective note texts.
        """
        author_text = 'Mary Maison 1, John Smith 1'
        affiliation_text = '1 University of Science, Smithonia'

        # Raw (not yet annotated) TEI input.
        raw_header_node = get_header_tei_node([
            E.note(author_text),
            E.lb(),
            E.note(affiliation_text),
            E.lb()
        ])
        test_helper.tei_raw_file_path.write_bytes(
            etree.tostring(raw_header_node))

        # Target XML the annotations are derived from.
        target_node = get_target_xml_node(author_nodes=[
            E.contrib(E.name(E.surname('Maison'), E('given-names', 'Mary'))),
            E.contrib(E.name(E.surname('Smith'), E('given-names', 'John'))),
            E.aff(
                E.label('1'),
                E.institution('University of Science'),
                E.country('Smithonia'))
        ])
        test_helper.xml_file_path.write_bytes(etree.tostring(target_node))

        main_args = {
            **test_helper.main_args_dict,
            'fields': ','.join(['title', 'author', 'author_aff', 'abstract']),
            'matcher': 'simple'
        }
        main(dict_to_args(main_args), save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_author = get_xpath_text(tei_auto_root, '//byline/docAuthor')
        assert actual_author == author_text
        actual_affiliation = get_xpath_text(
            tei_auto_root, '//byline/affiliation')
        assert actual_affiliation == affiliation_text
Example #2
0
    def test_should_auto_annotate_label_containing_dot_within_reference(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a reference label ending in a dot is annotated.

        The raw TEI note holds the label ``'1.'``, the reference text and a
        trailing space. ``main`` runs with the ``simple`` matcher on the
        ``reference`` field with the ``reference.use-raw-text=true``
        mapping override. The first ``bibl`` must contain label and
        reference text joined by a space, and its ``label`` child must
        equal the label.
        """
        label_with_dot = '1.'

        raw_note_text = label_with_dot + ' ' + REFERENCE_TEXT_1 + ' '
        raw_tei_node = get_reference_segmenter_tei_node(
            [E.note(raw_note_text)])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_node = get_target_xml_node(reference_nodes=[
            get_jats_reference_node(label_with_dot, REFERENCE_TEXT_1),
        ])
        test_helper.xml_file_path.write_bytes(etree.tostring(target_node))

        main_args = {
            **test_helper.main_args_dict,
            'matcher': 'simple',
            'fields': 'reference',
            'xml-mapping-overrides': 'reference.use-raw-text=true'
        }
        main(dict_to_args(main_args), save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_bibl_text = get_xpath_text(tei_auto_root, '//listBibl/bibl[1]')
        assert actual_bibl_text == ' '.join(
            [label_with_dot, REFERENCE_TEXT_1])
        actual_label_text = get_xpath_text(
            tei_auto_root, '//listBibl/bibl[1]/label')
        assert actual_label_text == label_with_dot
    def test_should_auto_annotate_affiliation_preceding_number_using_simple_matcher(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a number note before the affiliation joins the affiliation.

        The raw TEI header holds title, authors, a note containing only
        ``'1'``, a note with the affiliation text, and the abstract. The
        target XML affiliation has no label element. After running ``main``
        with the ``simple`` matcher, the annotated affiliation must equal
        the number and the affiliation text joined by a space; title,
        author and abstract are checked as well.
        """
        author_text = 'Mary Maison 1, John Smith 1'
        affiliation_text_1 = '1'
        affiliation_text_2 = 'University of Science, Smithonia'
        affiliation_text = ' '.join([affiliation_text_1, affiliation_text_2])

        # Raw TEI header: one note per line, separated by line breaks.
        raw_header_node = get_header_tei_node([
            E.note(TITLE_1),
            E.lb(),
            E.note(author_text),
            E.lb(),
            E.note(affiliation_text_1),
            E.lb(),
            E.note(affiliation_text_2),
            E.lb(),
            E.note(ABSTRACT_PREFIX_1, E.lb(), ABSTRACT_1)
        ])
        test_helper.tei_raw_file_path.write_bytes(
            etree.tostring(raw_header_node))

        # Target XML with title, two authors, one affiliation and abstract.
        target_node = get_target_xml_node(
            title=TITLE_1,
            author_nodes=[
                E.contrib(
                    E.name(E.surname('Maison'), E('given-names', 'Mary'))),
                E.contrib(
                    E.name(E.surname('Smith'), E('given-names', 'John'))),
                E.aff(
                    E.institution('University of Science'),
                    E.country('Smithonia'))
            ],
            abstract_node=E.abstract(E.p(ABSTRACT_1)))
        test_helper.xml_file_path.write_bytes(etree.tostring(target_node))

        main_args = {
            **test_helper.main_args_dict,
            'fields': ','.join(['title', 'author', 'author_aff', 'abstract']),
            'matcher': 'simple'
        }
        main(dict_to_args(main_args), save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_title = get_xpath_text(tei_auto_root, '//docTitle/titlePart')
        assert actual_title == TITLE_1
        actual_author = get_xpath_text(tei_auto_root, '//byline/docAuthor')
        assert actual_author == author_text
        actual_affiliation = get_xpath_text(
            tei_auto_root, '//byline/affiliation')
        assert actual_affiliation == affiliation_text
        actual_abstract = get_xpath_text(
            tei_auto_root, '//div[@type="abstract"]')
        assert actual_abstract == (ABSTRACT_PREFIX_1 + ABSTRACT_1)
    def test_should_not_preserve_sub_tags(self, annotator: MagicMock):
        """Check that sub tags are dropped when ``preserve_sub_tags`` is False.

        Before annotating, the ``sub1`` and ``sub2`` elements carry their
        text. After ``annotate_structured_document_inplace`` is called with
        ``preserve_tags=True`` and ``preserve_sub_tags=False``, the same
        xpath lookups must yield an empty string.
        """
        structured_document = _structured_document_with_sub_elements(
            E.sub1('sub1'), E.sub2('sub2'))

        # Precondition: both sub elements are present with their text.
        for xpath, expected_text in (('//sub1', 'sub1'), ('//sub2', 'sub2')):
            assert get_xpath_text(
                _get_root(structured_document), xpath) == expected_text

        annotate_structured_document_inplace(
            structured_document,
            annotator=annotator,
            preserve_tags=True,
            preserve_sub_tags=False,
            fields={'title'})

        for xpath in ('//sub1', '//sub2'):
            assert get_xpath_text(_get_root(structured_document), xpath) == ''
Example #5
0
    def test_should_not_auto_annotate_other_sub_tags(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check the ``bibl`` text for a reference with nested target elements.

        The target JATS reference contains an ``article-title`` and a
        ``source`` element separated by a space. The raw TEI note holds the
        label followed by the plain text of those nodes. After running
        ``main`` with the ``simple`` matcher on the ``reference`` field,
        ``//listBibl/bibl`` must equal label and reference text joined by a
        space.
        """
        target_reference_content_nodes = [
            E('article-title', ARTICLE_TITLE_1),
            ' ',
            E.source(SOURCE_1),
        ]
        # Plain text is taken before the nodes are placed into the target.
        reference_text = get_nodes_text(target_reference_content_nodes)

        jats_reference_node = get_jats_reference_node(
            LABEL_1, *target_reference_content_nodes)
        target_jats_xml = etree.tostring(
            get_target_xml_node(reference_nodes=[jats_reference_node]))
        LOGGER.debug('target_jats_xml: %s', target_jats_xml)
        test_helper.xml_file_path.write_bytes(target_jats_xml)

        raw_tei_node = get_reference_segmenter_tei_node(
            [E.note(LABEL_1 + ' ' + reference_text)])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        main_args = {
            **test_helper.main_args_dict,
            'matcher': 'simple',
            'fields': 'reference'
        }
        main(dict_to_args(main_args), save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_bibl_text = get_xpath_text(tei_auto_root, '//listBibl/bibl')
        assert actual_bibl_text == ' '.join([LABEL_1, reference_text])
    def test_should_auto_annotate_title(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a single header note matching the target title is annotated.

        Both the raw TEI note and the target XML title are ``TEXT_1``;
        ``main`` runs with the helper's default arguments and
        ``//docTitle/titlePart`` must equal ``TEXT_1`` afterwards.
        """
        raw_header_node = get_header_tei_node([E.note(TEXT_1)])
        test_helper.tei_raw_file_path.write_bytes(
            etree.tostring(raw_header_node))

        target_node = get_target_xml_node(title=TEXT_1)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_node))

        main(list(test_helper.main_args), save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_title = get_xpath_text(tei_auto_root, '//docTitle/titlePart')
        assert actual_title == TEXT_1
    def test_should_extend_title_annotation_to_whole_line(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check the annotated title when the raw line has a 'Title: ' prefix.

        The raw TEI note is ``'Title: '`` followed by the title, while the
        target XML contains only the title. After running ``main`` with
        ``--matcher=simple``, ``//docTitle/titlePart`` is compared against
        the title text.
        """
        title_text = 'Chocolate bars for mice'

        raw_header_node = get_header_tei_node(
            [E.note('Title: ' + title_text)])
        test_helper.tei_raw_file_path.write_bytes(
            etree.tostring(raw_header_node))

        target_node = get_target_xml_node(title=title_text)
        test_helper.xml_file_path.write_bytes(etree.tostring(target_node))

        main_args = [*test_helper.main_args, '--matcher=simple']
        main(main_args, save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_title = get_xpath_text(tei_auto_root, '//docTitle/titlePart')
        assert actual_title == title_text
    def test_should_skip_errors(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a malformed target XML does not block the valid document.

        An additional ``document0`` pair is written whose target XML has
        ``b'error'`` appended after the serialized tree. The helper's own
        document pair is valid. With ``--skip-errors`` the run must still
        produce the annotated title for the helper's document.
        """
        # Extra document whose target XML is deliberately corrupted.
        tei_raw_other_file_path = test_helper.tei_raw_path.joinpath(
            'document0.header.tei.xml')
        other_tei_bytes = etree.tostring(get_header_tei_node([E.note(TEXT_1)]))
        tei_raw_other_file_path.write_bytes(other_tei_bytes)

        xml_other_file_path = test_helper.xml_path.joinpath('document0.xml')
        broken_xml_bytes = (
            etree.tostring(get_target_xml_node(title=TEXT_1)) + b'error')
        xml_other_file_path.write_bytes(broken_xml_bytes)

        # The regular, well-formed document pair.
        valid_tei_bytes = etree.tostring(get_header_tei_node([E.note(TEXT_1)]))
        test_helper.tei_raw_file_path.write_bytes(valid_tei_bytes)
        valid_xml_bytes = etree.tostring(get_target_xml_node(title=TEXT_1))
        test_helper.xml_file_path.write_bytes(valid_xml_bytes)

        main_args = [
            *test_helper.main_args, '--matcher=simple', '--skip-errors'
        ]
        main(main_args, save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_title = get_xpath_text(tei_auto_root, '//docTitle/titlePart')
        assert actual_title == TEXT_1
Example #9
0
    def test_should_auto_annotate_single_reference(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Check that a single labelled reference is annotated as ``bibl``.

        The raw TEI note holds the label and reference text separated by a
        space; the target XML has one JATS reference with the same label
        and text. After running ``main`` with the ``simple`` matcher on the
        ``reference`` field, ``//listBibl/bibl`` must equal label and text
        joined by a space.
        """
        raw_tei_node = get_reference_segmenter_tei_node(
            [E.note(LABEL_1 + ' ' + REFERENCE_TEXT_1)])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

        target_node = get_target_xml_node(reference_nodes=[
            get_jats_reference_node(LABEL_1, REFERENCE_TEXT_1),
        ])
        test_helper.xml_file_path.write_bytes(etree.tostring(target_node))

        main_args = {
            **test_helper.main_args_dict,
            'matcher': 'simple',
            'fields': 'reference'
        }
        main(dict_to_args(main_args), save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        actual_bibl_text = get_xpath_text(tei_auto_root, '//listBibl/bibl')
        assert actual_bibl_text == ' '.join([LABEL_1, REFERENCE_TEXT_1])
Example #10
0
def get_tei_xpath_text(*args, **kwargs):
    """Call ``get_xpath_text`` with ``namespaces`` fixed to ``TEI_NS_MAP``.

    All positional and keyword arguments are forwarded unchanged. Passing a
    ``namespaces`` keyword yourself raises ``TypeError`` because the
    keyword would then be supplied twice.
    """
    tei_text = get_xpath_text(*args, namespaces=TEI_NS_MAP, **kwargs)
    return tei_text