def test_should_replace_pmid_with_too_many_digits_text(self):
     """Expect no PMID, and WOS_1 as other pub id, for a 'WOS: ' pmid element."""
     wos_in_pmid = get_jats_pmid_element('WOS: ' + WOS_1)
     result = fix_reference(
         clone_node(get_jats_mixed_ref('PMID: ', wos_in_pmid)))
     pmid_texts = get_text_content_list(result.xpath(JatsXpaths.PMID))
     assert '|'.join(pmid_texts) == ''
     other_texts = get_text_content_list(
         result.xpath(JatsXpaths.OTHER_PUB_ID))
     assert '|'.join(other_texts) == WOS_1
 def test_should_not_include_doi_colon_in_pii(self):
     """Expect DOI_1 and PII_1 to be separated when one doi element holds both."""
     combined_text = PII_1 + ' [pii]; ' + DOI_1 + ' [doi]'
     source_ref = get_jats_mixed_ref(
         'doi:', get_jats_doi_element(combined_text))
     result = fix_reference(clone_node(source_ref))
     doi_text = '|'.join(
         get_text_content_list(result.xpath(JatsXpaths.DOI)))
     pii_text = '|'.join(
         get_text_content_list(result.xpath(JatsXpaths.PII)))
     assert doi_text == DOI_1
     assert pii_text == PII_1
 def test_should_separately_annotate_pii_with_preceding_element(self):
     """Same pii/doi split as above, with an `other` element preceding 'doi: '."""
     leading_element = E.other('other text')
     doi_element = get_jats_doi_element(
         PII_1 + ' [pii]; ' + DOI_1 + ' [doi]')
     source_ref = get_jats_mixed_ref(leading_element, 'doi: ', doi_element)
     result = fix_reference(clone_node(source_ref))
     doi_text = '|'.join(
         get_text_content_list(result.xpath(JatsXpaths.DOI)))
     pii_text = '|'.join(
         get_text_content_list(result.xpath(JatsXpaths.PII)))
     assert doi_text == DOI_1
     assert pii_text == PII_1
 def test_should_separately_annotate_invalid_pii_as_other_pub_id(self):
     """Expect INVALID_PII_1 to end up as other pub id and DOI_1 as the doi."""
     combined_text = INVALID_PII_1 + ' [pii]; ' + DOI_1 + ' [doi]'
     source_ref = get_jats_mixed_ref(
         'doi: ', get_jats_doi_element(combined_text))
     result = fix_reference(clone_node(source_ref))
     doi_text = '|'.join(
         get_text_content_list(result.xpath(JatsXpaths.DOI)))
     other_text = '|'.join(
         get_text_content_list(result.xpath(JatsXpaths.OTHER_PUB_ID)))
     assert doi_text == DOI_1
     assert other_text == INVALID_PII_1
 def test_should_remove_doi_prefix_from_doi(self):
     """Expect the 'doi:' prefix inside the doi element to be dropped."""
     prefixed_doi_element = get_jats_doi_element('doi:' + DOI_1)
     source_ref = get_jats_mixed_ref('some text', prefixed_doi_element)
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == DOI_1
 def test_should_separately_annotate_pmid_with_preceding_element(self):
     """Expect PMID_1 to be annotated from plain 'PMID:' text after an element."""
     leading_element = E.other('other text')
     source_ref = get_jats_mixed_ref(leading_element, 'PMID:' + PMID_1)
     result = fix_reference(clone_node(source_ref))
     pmid_texts = get_text_content_list(result.xpath(JatsXpaths.PMID))
     assert '|'.join(pmid_texts) == PMID_1
 def test_should_remove_pmcid_from_article_title(self):
     """Expect a '; ' + PMCID_1 suffix to be removed from the article title."""
     title_element = E('article-title', ARTICLE_TITLE_1 + '; ' + PMCID_1)
     source_ref = get_jats_mixed_ref('title: ', title_element)
     result = fix_reference(clone_node(source_ref))
     title_texts = get_text_content_list(
         result.xpath(JatsXpaths.ARTICLE_TITLE))
     assert '|'.join(title_texts) == ARTICLE_TITLE_1
 def test_should_remove_duplicate_doi_with_tail(self):
     """Expect a doi element holding DOI_1 twice to be reduced to one DOI_1."""
     duplicated_doi_element = get_jats_doi_element(DOI_1 + '; ' + DOI_1)
     source_ref = get_jats_mixed_ref(
         'doi: ', duplicated_doi_element, 'tail text')
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == DOI_1
    def test_should_not_preserve_original_affiliation_annotation(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Expect two raw TEI affiliations to be re-segmented into one.

        The target JATS holds a single aff (with the text twice), the raw TEI
        holds two affiliations; with segment-affiliation enabled the result
        is asserted to be one affiliation containing both TEI texts.
        """
        # Only a single JATS affiliation is created, which would usually
        # change the TEI affiliation. With --segment-affiliation we expect
        # the affiliation segmentation to be updated.
        base_text = 'Some affiliation'
        jats_text = base_text + '.'
        tei_text = base_text + ' .'
        jats_root = get_target_xml_node(affiliation_nodes=[
            E.aff(jats_text, jats_text),
        ])
        target_jats_xml = etree.tostring(jats_root)
        raw_tei_root = get_affiliation_tei_node([
            TEI_E.affiliation(tei_text),
            TEI_E.affiliation(tei_text)
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))
        LOGGER.debug('target_jats_xml: %s', target_jats_xml)
        test_helper.xml_file_path.write_bytes(target_jats_xml)
        cli_options = {
            **test_helper.main_args_dict,
            'matcher': 'simple',
            'fields': 'author_aff',
            'segment-affiliation': True
        }
        main(dict_to_args(cli_options), save_main_session=False)

        auto_root = test_helper.get_tei_auto_root()
        affiliation_texts = get_text_content_list(
            get_all_affiliations(auto_root))
        assert affiliation_texts == [tei_text + tei_text]
 def test_should_remove_doi_suffix_from_doi_without_tail(self):
     """Expect a trailing ' [doi]' marker to be removed from the doi text."""
     suffixed_doi_element = get_jats_doi_element(DOI_1 + ' [doi]')
     source_ref = get_jats_mixed_ref('doi: ', suffixed_doi_element)
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == DOI_1
 def test_should_remove_double_pmc_prefix_from_pmcid(self):
     """Expect an extra leading 'PMC' before PMCID_1 to be removed."""
     prefixed_pmcid_element = get_jats_pmcid_element('PMC' + PMCID_1)
     source_ref = get_jats_mixed_ref('PMCID: ', prefixed_pmcid_element)
     result = fix_reference(clone_node(source_ref))
     pmcid_texts = get_text_content_list(result.xpath(JatsXpaths.PMCID))
     assert '|'.join(pmcid_texts) == PMCID_1
 def test_should_convert_pub_id_type_to_lower_case(self):
     """Expect a pub-id created with type 'PMCID' to be found via the PMCID xpath."""
     upper_case_pub_id = get_jats_pub_id_element(PMCID_1, 'PMCID')
     source_ref = get_jats_mixed_ref(upper_case_pub_id)
     result = fix_reference(clone_node(source_ref))
     pmcid_texts = get_text_content_list(result.xpath(JatsXpaths.PMCID))
     assert '|'.join(pmcid_texts) == PMCID_1
 def test_should_not_include_pubmed_prefix_in_doi(self):
     """Expect the doi to be exactly DOI_1 when followed by '. PubMed PMID: '."""
     text_parts = ['doi: ', DOI_1, '. PubMed PMID: ', PMID_1]
     source_ref = get_jats_mixed_ref(*text_parts)
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == DOI_1
    def test_should_not_merge_multiple_affiliation_annotations_surrounded_by_line_feeds(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
            segment_affiliation: bool):
        """Expect two line-feed-wrapped affiliations to stay two affiliations.

        The same two texts are used for the JATS target and the raw TEI; the
        resulting affiliation texts are compared after stripping whitespace.
        """
        aff1_text = '\nSome affiliation 1\n'
        aff2_text = '\nSome affiliation 2\n'
        jats_root = get_target_xml_node(
            affiliation_nodes=[E.aff(aff1_text), E.aff(aff2_text)])
        target_jats_xml = etree.tostring(jats_root)
        raw_tei_root = get_affiliation_tei_node([
            TEI_E.affiliation(aff1_text),
            TEI_E.affiliation(aff2_text)
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))
        LOGGER.debug('target_jats_xml: %s', target_jats_xml)
        test_helper.xml_file_path.write_bytes(target_jats_xml)
        cli_options = {
            **test_helper.main_args_dict,
            'matcher': 'simple',
            'segment-affiliation': segment_affiliation,
            'fields': 'author_aff'
        }
        main(dict_to_args(cli_options), save_main_session=False)

        auto_root = test_helper.get_tei_auto_root()
        affiliation_texts = get_text_content_list(
            get_all_affiliations(auto_root))
        stripped_texts = [text.strip() for text in affiliation_texts]
        assert stripped_texts == [aff1_text.strip(), aff2_text.strip()]
 def test_should_remove_pmid_non_digit_text(self):
     """Expect a 'PMID: ' prefix inside the pmid element to be removed."""
     prefixed_pmid_element = get_jats_pmid_element('PMID: ' + PMID_1)
     source_ref = get_jats_mixed_ref(prefixed_pmid_element)
     result = fix_reference(clone_node(source_ref))
     pmid_texts = get_text_content_list(result.xpath(JatsXpaths.PMID))
     assert '|'.join(pmid_texts) == PMID_1
 def test_should_keep_original_pmid_if_already_present_and_valid(self):
     """Expect only PMID_1 to remain when trailing text names another PMID."""
     existing_pmid_element = get_jats_pmid_element(PMID_1)
     source_ref = get_jats_mixed_ref(
         existing_pmid_element, ', alternative PMID: 123')
     result = fix_reference(clone_node(source_ref))
     pmid_texts = get_text_content_list(result.xpath(JatsXpaths.PMID))
     assert '|'.join(pmid_texts) == PMID_1
 def test_should_convert_doi_with_outside_url_prefix_to_ext_link(self):
     """Expect URL prefix text plus a doi element to become one ext-link."""
     leading_text = 'some text ' + HTTPS_DOI_URL_PREFIX
     source_ref = get_jats_mixed_ref(
         leading_text, get_jats_doi_element(DOI_1), 'tail text')
     result = fix_reference(clone_node(source_ref))
     link_texts = get_text_content_list(result.xpath(JatsXpaths.EXT_LINK))
     assert '|'.join(link_texts) == HTTPS_DOI_URL_PREFIX + DOI_1
 def test_should_remove_doi_prefix_after_preceeding_element_with_tail_text(
         self):
     """Expect 'doi:' to be dropped from a doi element after an element with tail."""
     leading_element = E.other('other text')
     prefixed_doi_element = get_jats_doi_element('doi:' + DOI_1)
     source_ref = get_jats_mixed_ref(
         leading_element, 'tail text', prefixed_doi_element)
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == DOI_1
 def test_should_not_remove_other_square_brackets_from_ext_link(self):
     """Expect an ext-link ending in '[other]' to keep its text and href."""
     url = HTTPS_DOI_URL_PREFIX + DOI_1 + '[other]'
     source_ref = get_jats_mixed_ref(get_jats_ext_link_element(url))
     result = fix_reference(clone_node(source_ref))
     link_nodes = result.xpath(JatsXpaths.EXT_LINK)
     joined_link_text = '|'.join(get_text_content_list(link_nodes))
     assert joined_link_text == url
     first_link = link_nodes[0]
     assert first_link.attrib[XLINK_HREF] == url
コード例 #20
0
 def test_should_create_separate_author_node(self):
     """Expect two author items to produce two author nodes, in order."""
     author_items = [
         ExtractedItem(Tags.AUTHOR, TEXT_1),
         ExtractedItem(Tags.AUTHOR, TEXT_2)
     ]
     xml_root = extracted_items_to_xml(author_items)
     assert xml_root is not None
     author_texts = get_text_content_list(
         xml_root.findall(XmlPaths.AUTHOR))
     assert author_texts == [TEXT_1, TEXT_2]
 def test_should_remove_doi_pub_id_element_if_not_containing_valid_doi(
         self):
     """Expect no doi text to remain when the doi element holds 'not a doi'."""
     invalid_doi_element = get_jats_doi_element('not a doi')
     source_ref = get_jats_mixed_ref('doi: ', invalid_doi_element)
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == ''
 def test_should_remove_quotes_and_trailing_comma_from_article_title(self):
     """Expect double quotes and a trailing ', ' to be removed from the title."""
     title_element = E(
         'article-title',
         SpecialChars.LDQUO,
         E.italic(ARTICLE_TITLE_1),
         ', ')
     source_ref = get_jats_mixed_ref(
         'title: ', title_element, SpecialChars.RDQUO)
     result = fix_reference(clone_node(source_ref))
     title_texts = get_text_content_list(
         result.xpath(JatsXpaths.ARTICLE_TITLE))
     assert '|'.join(title_texts) == ARTICLE_TITLE_1
 def test_should_remove_left_right_single_quotes_from_article_title(self):
     """Expect left/right single quotes around the title text to be removed."""
     quoted_title = SpecialChars.LSQUO + ARTICLE_TITLE_1 + SpecialChars.RSQUO
     source_ref = get_jats_mixed_ref(
         'title: ', E('article-title', quoted_title))
     result = fix_reference(clone_node(source_ref))
     title_texts = get_text_content_list(
         result.xpath(JatsXpaths.ARTICLE_TITLE))
     assert '|'.join(title_texts) == ARTICLE_TITLE_1
 def test_should_split_ext_link_containing_multiple_http_links(self):
     """Expect an ext-link holding two concatenated URLs to be split in two."""
     expected_urls = [
         HTTP_DOI_URL_PREFIX + DOI_1,
         HTTP_DOI_URL_PREFIX + DOI_2
     ]
     merged_link_element = get_jats_ext_link_element(
         expected_urls[0] + expected_urls[1])
     source_ref = get_jats_mixed_ref(merged_link_element)
     result = fix_reference(clone_node(source_ref))
     link_nodes = result.xpath(JatsXpaths.EXT_LINK)
     assert get_text_content_list(link_nodes) == expected_urls
 def test_should_remove_duplicate_doi_ignoring_punct_with_tail(self):
     """Expect two dois differing only in '-' vs '.' to collapse to the first."""
     first_variant = DOI_1 + '.ab-123'
     second_variant = DOI_1 + '.ab.123'
     doi_element = get_jats_doi_element(
         first_variant + '; ' + second_variant)
     source_ref = get_jats_mixed_ref('doi: ', doi_element, 'tail text')
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == first_variant
 def test_should_remove_double_doi_in_ext_link_square_brackets(self):
     """Expect a '[DOI_1]' suffix repeating the doi to be cut from the ext-link."""
     expected_url = HTTPS_DOI_URL_PREFIX + DOI_1
     link_element = get_jats_ext_link_element(
         HTTPS_DOI_URL_PREFIX + DOI_1 + '[' + DOI_1 + ']')
     source_ref = get_jats_mixed_ref(link_element)
     result = fix_reference(clone_node(source_ref))
     link_nodes = result.xpath(JatsXpaths.EXT_LINK)
     joined_link_text = '|'.join(get_text_content_list(link_nodes))
     assert joined_link_text == expected_url
     first_link = link_nodes[0]
     assert first_link.attrib[XLINK_HREF] == expected_url
 def test_should_remove_doi_duplicate_pii_suffix_from_doi_with_tail(self):
     """Expect a repeated doi fragment tagged ' [pii]' to be removed from the doi."""
     repeated_fragment = 'doi-duplicate'
     expected_doi = DOI_1 + '.' + repeated_fragment
     doi_element = get_jats_doi_element(
         expected_doi + '  ' + repeated_fragment + ' [pii]')
     source_ref = get_jats_mixed_ref('doi: ', doi_element, 'tail text')
     result = fix_reference(clone_node(source_ref))
     doi_texts = get_text_content_list(result.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == expected_doi
 def test_should_convert_doi_with_outside_spaced_url_prefix_to_ext_link(
         self):
     """Expect a spaced URL prefix plus doi element to become an ext-link.

     The link text keeps the spaced prefix, while the href attribute is
     asserted to use the unspaced HTTPS_DOI_URL_PREFIX.
     """
     leading_text = 'some text ' + HTTPS_SPACED_DOI_URL_PREFIX
     source_ref = get_jats_mixed_ref(
         leading_text, get_jats_doi_element(DOI_1), 'tail text')
     result = fix_reference(clone_node(source_ref))
     link_nodes = result.xpath(JatsXpaths.EXT_LINK)
     joined_link_text = '|'.join(get_text_content_list(link_nodes))
     assert joined_link_text == HTTPS_SPACED_DOI_URL_PREFIX + DOI_1
     expected_attributes = {
         'ext-link-type': 'uri',
         XLINK_HREF: HTTPS_DOI_URL_PREFIX + DOI_1
     }
     assert link_nodes[0].attrib == expected_attributes
 def test_should_fix_jats_xml_using_source_path(self, input_dir: Path,
                                                output_dir: Path):
     """Run main with --source-path/--output-path and check the fixed doi.

     Writes a JATS file with a 'doi:'-prefixed doi element to the input
     directory, then asserts that file1.xml exists in the output directory
     and that its doi is DOI_1.
     """
     source_ref = get_jats_mixed_ref(
         'doi: ', get_jats_doi_element('doi:' + DOI_1))
     source_file = input_dir / 'file1.xml'
     source_file.parent.mkdir()
     jats_bytes = etree.tostring(get_jats(references=[source_ref]))
     source_file.write_bytes(jats_bytes)
     expected_output_file = output_dir / 'file1.xml'
     cli_args = [
         '--source-path=' + str(source_file),
         '--output-path=' + str(output_dir)
     ]
     main(cli_args)
     assert expected_output_file.exists()
     output_root = parse_xml(str(expected_output_file))
     doi_texts = get_text_content_list(output_root.xpath(JatsXpaths.DOI))
     assert '|'.join(doi_texts) == DOI_1
コード例 #30
0
    def test_should_segment_figures_if_enabled(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Expect one raw TEI figure to be split into two with segment-figures.

        The target JATS holds two figs (label + caption each); the raw TEI
        holds a single figure with the text of both. The resulting figures
        are asserted to match the two target figures' texts.
        """
        first_fig_nodes = [
            E.label(LABEL_1),
            ' ',
            E.caption(E.p(TEXT_1))
        ]
        second_fig_nodes = [
            E.label(LABEL_2),
            ' ',
            E.caption(E.p(TEXT_2))
        ]
        # The fig elements are built (and serialized) before any text is read
        # back from the node lists, matching the order the checks rely on.
        jats_root = get_target_xml_node(body_nodes=[
            E.fig(*first_fig_nodes),
            E.fig(*second_fig_nodes)
        ])
        target_jats_xml = etree.tostring(jats_root)
        merged_figure_text = get_nodes_text(
            first_fig_nodes + [' '] + second_fig_nodes)
        raw_tei_root = get_training_tei_node([E.figure(merged_figure_text)])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))
        LOGGER.debug('target_jats_xml: %s', target_jats_xml)
        test_helper.xml_file_path.write_bytes(target_jats_xml)
        cli_options = {
            **test_helper.main_args_dict,
            'matcher': 'simple',
            'segment-figures': True,
            'fields': 'figure'
        }
        main(dict_to_args(cli_options), save_main_session=False)

        auto_root = test_helper.get_tei_auto_root()
        figure_texts = get_text_content_list(get_all_figures(auto_root))
        assert figure_texts == [
            get_nodes_text(first_fig_nodes),
            get_nodes_text(second_fig_nodes)
        ]