def test_should_replace_pmid_with_too_many_digits_text(self):
    """A PMID element containing ``'WOS: ' + WOS_1`` must not stay a PMID.

    After fixing, the PMID xpath yields no text and ``WOS_1`` is found via
    the other-pub-id xpath instead.
    """
    source_ref = get_jats_mixed_ref(
        'PMID: ',
        get_jats_pmid_element('WOS: ' + WOS_1)
    )
    result_ref = fix_reference(clone_node(source_ref))

    pmid_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PMID))
    assert '|'.join(pmid_texts) == ''

    other_texts = get_text_content_list(
        result_ref.xpath(JatsXpaths.OTHER_PUB_ID)
    )
    assert '|'.join(other_texts) == WOS_1
def test_should_not_include_doi_colon_in_pii(self):
    """A DOI element holding '<pii> [pii]; <doi> [doi]' after a 'doi:' label
    is expected to yield exactly ``DOI_1`` as DOI and ``PII_1`` as PII.
    """
    doi_element_text = PII_1 + ' [pii]; ' + DOI_1 + ' [doi]'
    source_ref = get_jats_mixed_ref(
        'doi:',
        get_jats_doi_element(doi_element_text)
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    pii_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PII))

    assert '|'.join(doi_texts) == DOI_1
    assert '|'.join(pii_texts) == PII_1
def test_should_separately_annotate_pii_with_preceding_element(self):
    """Same PII/DOI split as without a preceding element, but the reference
    starts with an ``<other>`` element before the 'doi: ' label.
    """
    doi_element_text = PII_1 + ' [pii]; ' + DOI_1 + ' [doi]'
    source_ref = get_jats_mixed_ref(
        E.other('other text'),
        'doi: ',
        get_jats_doi_element(doi_element_text)
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    pii_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PII))

    assert '|'.join(doi_texts) == DOI_1
    assert '|'.join(pii_texts) == PII_1
def test_should_separately_annotate_invalid_pii_as_other_pub_id(self):
    """When the '[pii]' part is ``INVALID_PII_1``, it is expected under the
    other-pub-id xpath while the DOI is still extracted as ``DOI_1``.
    """
    doi_element_text = INVALID_PII_1 + ' [pii]; ' + DOI_1 + ' [doi]'
    source_ref = get_jats_mixed_ref(
        'doi: ',
        get_jats_doi_element(doi_element_text)
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    other_texts = get_text_content_list(
        result_ref.xpath(JatsXpaths.OTHER_PUB_ID)
    )

    assert '|'.join(doi_texts) == DOI_1
    assert '|'.join(other_texts) == INVALID_PII_1
def test_should_remove_doi_prefix_from_doi(self):
    """A DOI element whose text is 'doi:' + DOI_1 should be reduced to DOI_1."""
    prefixed_doi_element = get_jats_doi_element('doi:' + DOI_1)
    source_ref = get_jats_mixed_ref('some text', prefixed_doi_element)
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == DOI_1
def test_should_separately_annotate_pmid_with_preceding_element(self):
    """Plain 'PMID:' + PMID_1 text following an ``<other>`` element should
    end up as a PMID with text ``PMID_1``.
    """
    source_ref = get_jats_mixed_ref(
        E.other('other text'),
        'PMID:' + PMID_1
    )
    result_ref = fix_reference(clone_node(source_ref))

    pmid_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PMID))
    assert '|'.join(pmid_texts) == PMID_1
def test_should_remove_pmcid_from_article_title(self):
    """An article title ending in '; ' + PMCID_1 should be trimmed back to
    ``ARTICLE_TITLE_1``.
    """
    title_element = E('article-title', ARTICLE_TITLE_1 + '; ' + PMCID_1)
    source_ref = get_jats_mixed_ref('title: ', title_element)
    result_ref = fix_reference(clone_node(source_ref))

    title_texts = get_text_content_list(
        result_ref.xpath(JatsXpaths.ARTICLE_TITLE)
    )
    assert '|'.join(title_texts) == ARTICLE_TITLE_1
def test_should_remove_duplicate_doi_with_tail(self):
    """A DOI element repeating DOI_1 twice ('; '-separated), followed by
    tail text, should yield DOI_1 only once.
    """
    repeated_doi_text = DOI_1 + '; ' + DOI_1
    source_ref = get_jats_mixed_ref(
        'doi: ',
        get_jats_doi_element(repeated_doi_text),
        'tail text'
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == DOI_1
def test_should_not_preserve_original_affiliation_annotation(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Run auto-annotation with 'segment-affiliation' enabled and check that
    the two raw TEI affiliations come back as a single affiliation.
    """
    # We only create a single JATS affiliation, which would usually change
    # the TEI affiliation. With --segment-affiliation, we expect the
    # affiliation segmentation to be updated.
    prefix = 'Some affiliation'
    jats_text = prefix + '.'
    tei_text = prefix + ' .'

    jats_aff_nodes = [E.aff(jats_text, jats_text)]
    target_jats_xml = etree.tostring(
        get_target_xml_node(affiliation_nodes=jats_aff_nodes)
    )

    tei_raw_node = get_affiliation_tei_node([
        TEI_E.affiliation(tei_text),
        TEI_E.affiliation(tei_text)
    ])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(tei_raw_node))

    LOGGER.debug('target_jats_xml: %s', target_jats_xml)
    test_helper.xml_file_path.write_bytes(target_jats_xml)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'author_aff',
        'segment-affiliation': True
    }
    main(dict_to_args(args_dict), save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    affiliation_texts = get_text_content_list(
        get_all_affiliations(tei_auto_root)
    )
    assert affiliation_texts == [tei_text + tei_text]
def test_should_remove_doi_suffix_from_doi_without_tail(self):
    """A trailing ' [doi]' marker inside the DOI element should be dropped,
    leaving ``DOI_1``; the reference has no text after the DOI element.
    """
    suffixed_doi_element = get_jats_doi_element(DOI_1 + ' [doi]')
    source_ref = get_jats_mixed_ref('doi: ', suffixed_doi_element)
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == DOI_1
def test_should_remove_double_pmc_prefix_from_pmcid(self):
    """A PMCID element with text 'PMC' + PMCID_1 should be fixed to PMCID_1."""
    double_prefixed_element = get_jats_pmcid_element('PMC' + PMCID_1)
    source_ref = get_jats_mixed_ref('PMCID: ', double_prefixed_element)
    result_ref = fix_reference(clone_node(source_ref))

    pmcid_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PMCID))
    assert '|'.join(pmcid_texts) == PMCID_1
def test_should_convert_pub_id_type_to_lower_case(self):
    """A pub-id created with the upper-case type 'PMCID' should be matched by
    the PMCID xpath after fixing.
    """
    upper_case_element = get_jats_pub_id_element(PMCID_1, 'PMCID')
    source_ref = get_jats_mixed_ref(upper_case_element)
    result_ref = fix_reference(clone_node(source_ref))

    pmcid_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PMCID))
    assert '|'.join(pmcid_texts) == PMCID_1
def test_should_not_include_pubmed_prefix_in_doi(self):
    """With un-annotated text 'doi: <doi>. PubMed PMID: <pmid>', the extracted
    DOI should be exactly ``DOI_1``.
    """
    source_ref = get_jats_mixed_ref(
        'doi: ',
        DOI_1,
        '. PubMed PMID: ',
        PMID_1
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == DOI_1
def test_should_not_merge_multiple_affiliation_annotations_surrounded_by_line_feeds(
        self,
        test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
        segment_affiliation: bool):
    """Two affiliations whose text is wrapped in line feeds should remain two
    separate affiliations for the supplied ``segment_affiliation`` value.
    """
    aff1_text = '\nSome affiliation 1\n'
    aff2_text = '\nSome affiliation 2\n'

    jats_aff_nodes = [E.aff(aff1_text), E.aff(aff2_text)]
    target_jats_xml = etree.tostring(
        get_target_xml_node(affiliation_nodes=jats_aff_nodes)
    )

    tei_raw_node = get_affiliation_tei_node([
        TEI_E.affiliation(aff1_text),
        TEI_E.affiliation(aff2_text)
    ])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(tei_raw_node))

    LOGGER.debug('target_jats_xml: %s', target_jats_xml)
    test_helper.xml_file_path.write_bytes(target_jats_xml)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'segment-affiliation': segment_affiliation,
        'fields': 'author_aff'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    # Surrounding whitespace is ignored on both sides of the comparison.
    actual_texts = [
        text.strip()
        for text in get_text_content_list(get_all_affiliations(tei_auto_root))
    ]
    expected_texts = [aff1_text.strip(), aff2_text.strip()]
    assert actual_texts == expected_texts
def test_should_remove_pmid_non_digit_text(self):
    """A PMID element with text 'PMID: ' + PMID_1 should be fixed to PMID_1."""
    labelled_pmid_element = get_jats_pmid_element('PMID: ' + PMID_1)
    source_ref = get_jats_mixed_ref(labelled_pmid_element)
    result_ref = fix_reference(clone_node(source_ref))

    pmid_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PMID))
    assert '|'.join(pmid_texts) == PMID_1
def test_should_keep_original_pmid_if_already_present_and_valid(self):
    """An existing PMID element with PMID_1 should remain the only PMID even
    though the trailing text mentions 'alternative PMID: 123'.
    """
    source_ref = get_jats_mixed_ref(
        get_jats_pmid_element(PMID_1),
        ', alternative PMID: 123'
    )
    result_ref = fix_reference(clone_node(source_ref))

    pmid_texts = get_text_content_list(result_ref.xpath(JatsXpaths.PMID))
    assert '|'.join(pmid_texts) == PMID_1
def test_should_convert_doi_with_outside_url_prefix_to_ext_link(self):
    """When the DOI URL prefix sits in the text before the DOI element, the
    fixed reference should contain an ext-link whose text is prefix + DOI_1.
    """
    leading_text = 'some text ' + HTTPS_DOI_URL_PREFIX
    source_ref = get_jats_mixed_ref(
        leading_text,
        get_jats_doi_element(DOI_1),
        'tail text'
    )
    result_ref = fix_reference(clone_node(source_ref))

    ext_link_texts = get_text_content_list(
        result_ref.xpath(JatsXpaths.EXT_LINK)
    )
    assert '|'.join(ext_link_texts) == HTTPS_DOI_URL_PREFIX + DOI_1
def test_should_remove_doi_prefix_after_preceeding_element_with_tail_text(
        self):
    """The 'doi:' prefix inside the DOI element should be removed when the
    element follows an ``<other>`` element and its tail text.
    """
    source_ref = get_jats_mixed_ref(
        E.other('other text'),
        'tail text',
        get_jats_doi_element('doi:' + DOI_1)
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == DOI_1
def test_should_not_remove_other_square_brackets_from_ext_link(self):
    """An ext-link URL ending in '[other]' should keep both its text and its
    href unchanged.
    """
    url = HTTPS_DOI_URL_PREFIX + DOI_1 + '[other]'
    source_ref = get_jats_mixed_ref(get_jats_ext_link_element(url))
    result_ref = fix_reference(clone_node(source_ref))

    ext_link_nodes = result_ref.xpath(JatsXpaths.EXT_LINK)
    joined_link_text = '|'.join(get_text_content_list(ext_link_nodes))

    assert joined_link_text == url
    first_link = ext_link_nodes[0]
    assert first_link.attrib[XLINK_HREF] == url
def test_should_create_separate_author_node(self):
    """Two consecutive author items should produce two author nodes, in
    order, with texts TEXT_1 and TEXT_2.
    """
    author_items = [
        ExtractedItem(Tags.AUTHOR, TEXT_1),
        ExtractedItem(Tags.AUTHOR, TEXT_2)
    ]
    xml_root = extracted_items_to_xml(author_items)
    assert xml_root is not None

    author_nodes = xml_root.findall(XmlPaths.AUTHOR)
    assert get_text_content_list(author_nodes) == [TEXT_1, TEXT_2]
def test_should_remove_doi_pub_id_element_if_not_containing_valid_doi(
        self):
    """A DOI element with the text 'not a doi' should leave no DOI text in
    the fixed reference.
    """
    invalid_doi_element = get_jats_doi_element('not a doi')
    source_ref = get_jats_mixed_ref('doi: ', invalid_doi_element)
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == ''
def test_should_remove_quotes_and_trailing_comma_from_article_title(self):
    """The title element opens with a left double quote, wraps the title in
    italics and ends with ', '; the right double quote follows the element.
    The fixed title text should be just ``ARTICLE_TITLE_1``.
    """
    title_element = E(
        'article-title',
        SpecialChars.LDQUO,
        E.italic(ARTICLE_TITLE_1),
        ', '
    )
    source_ref = get_jats_mixed_ref(
        'title: ',
        title_element,
        SpecialChars.RDQUO
    )
    result_ref = fix_reference(clone_node(source_ref))

    title_texts = get_text_content_list(
        result_ref.xpath(JatsXpaths.ARTICLE_TITLE)
    )
    assert '|'.join(title_texts) == ARTICLE_TITLE_1
def test_should_remove_left_right_single_quotes_from_article_title(self):
    """A title wrapped in left/right single quote characters should be fixed
    to the bare ``ARTICLE_TITLE_1``.
    """
    quoted_title = SpecialChars.LSQUO + ARTICLE_TITLE_1 + SpecialChars.RSQUO
    source_ref = get_jats_mixed_ref(
        'title: ',
        E('article-title', quoted_title)
    )
    result_ref = fix_reference(clone_node(source_ref))

    title_texts = get_text_content_list(
        result_ref.xpath(JatsXpaths.ARTICLE_TITLE)
    )
    assert '|'.join(title_texts) == ARTICLE_TITLE_1
def test_should_split_ext_link_containing_multiple_http_links(self):
    """One ext-link whose text is two URLs concatenated without a separator
    should be split into two ext-links, one per URL.
    """
    url_1 = HTTP_DOI_URL_PREFIX + DOI_1
    url_2 = HTTP_DOI_URL_PREFIX + DOI_2
    combined_link_element = get_jats_ext_link_element(url_1 + url_2)
    source_ref = get_jats_mixed_ref(combined_link_element)
    result_ref = fix_reference(clone_node(source_ref))

    ext_link_nodes = result_ref.xpath(JatsXpaths.EXT_LINK)
    assert get_text_content_list(ext_link_nodes) == [url_1, url_2]
def test_should_remove_duplicate_doi_ignoring_punct_with_tail(self):
    """Two DOIs that differ only in one punctuation character ('-' vs '.')
    should be reduced to the first variant.
    """
    doi_1_a = DOI_1 + '.ab-123'
    doi_1_b = DOI_1 + '.ab.123'
    source_ref = get_jats_mixed_ref(
        'doi: ',
        get_jats_doi_element(doi_1_a + '; ' + doi_1_b),
        'tail text'
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == doi_1_a
def test_should_remove_double_doi_in_ext_link_square_brackets(self):
    """An ext-link URL followed by the same DOI in square brackets should be
    reduced to the plain DOI URL in both the link text and the href.
    """
    expected_url = HTTPS_DOI_URL_PREFIX + DOI_1
    link_text_with_duplicate = HTTPS_DOI_URL_PREFIX + DOI_1 + '[' + DOI_1 + ']'
    source_ref = get_jats_mixed_ref(
        get_jats_ext_link_element(link_text_with_duplicate)
    )
    result_ref = fix_reference(clone_node(source_ref))

    ext_link_nodes = result_ref.xpath(JatsXpaths.EXT_LINK)
    joined_link_text = '|'.join(get_text_content_list(ext_link_nodes))

    assert joined_link_text == expected_url
    assert ext_link_nodes[0].attrib[XLINK_HREF] == expected_url
def test_should_remove_doi_duplicate_pii_suffix_from_doi_with_tail(self):
    """A DOI followed by a '[pii]' value that merely repeats the DOI's last
    fragment should be fixed to the DOI alone.
    """
    doi_fragment_duplicate = 'doi-duplicate'
    doi = DOI_1 + '.' + doi_fragment_duplicate
    doi_element_text = doi + ' ' + doi_fragment_duplicate + ' [pii]'
    source_ref = get_jats_mixed_ref(
        'doi: ',
        get_jats_doi_element(doi_element_text),
        'tail text'
    )
    result_ref = fix_reference(clone_node(source_ref))

    doi_texts = get_text_content_list(result_ref.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == doi
def test_should_convert_doi_with_outside_spaced_url_prefix_to_ext_link(
        self):
    """With the spaced URL prefix in the preceding text, the ext-link text
    should keep the spaced prefix while the href uses the un-spaced prefix.
    """
    leading_text = 'some text ' + HTTPS_SPACED_DOI_URL_PREFIX
    source_ref = get_jats_mixed_ref(
        leading_text,
        get_jats_doi_element(DOI_1),
        'tail text'
    )
    result_ref = fix_reference(clone_node(source_ref))

    ext_link_nodes = result_ref.xpath(JatsXpaths.EXT_LINK)
    joined_link_text = '|'.join(get_text_content_list(ext_link_nodes))
    assert joined_link_text == HTTPS_SPACED_DOI_URL_PREFIX + DOI_1

    expected_attrib = {
        'ext-link-type': 'uri',
        XLINK_HREF: HTTPS_DOI_URL_PREFIX + DOI_1
    }
    assert ext_link_nodes[0].attrib == expected_attrib
def test_should_fix_jats_xml_using_source_path(self, input_dir: Path,
                                               output_dir: Path):
    """Run ``main`` with --source-path / --output-path on a single JATS file
    and check that the written output contains the fixed DOI.
    """
    source_ref = get_jats_mixed_ref(
        'doi: ',
        get_jats_doi_element('doi:' + DOI_1)
    )

    input_file = input_dir / 'file1.xml'
    input_file.parent.mkdir()
    jats_bytes = etree.tostring(get_jats(references=[source_ref]))
    input_file.write_bytes(jats_bytes)

    output_file = output_dir / 'file1.xml'
    cli_args = [
        '--source-path=%s' % input_file,
        '--output-path=%s' % output_dir
    ]
    main(cli_args)

    assert output_file.exists()

    fixed_root = parse_xml(str(output_file))
    doi_texts = get_text_content_list(fixed_root.xpath(JatsXpaths.DOI))
    assert '|'.join(doi_texts) == DOI_1
def test_should_segment_figures_if_enabled(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """With 'segment-figures' enabled, a single raw TEI figure containing the
    text of two JATS figures should come back as two separate figures.
    """
    target_figure_content_nodes_1 = [
        E.label(LABEL_1), ' ', E.caption(E.p(TEXT_1))
    ]
    target_figure_content_nodes_2 = [
        E.label(LABEL_2), ' ', E.caption(E.p(TEXT_2))
    ]

    # Build the target XML first, exactly as before: the node lists are
    # handed to E.fig here and only afterwards read via get_nodes_text.
    jats_figures = [
        E.fig(*target_figure_content_nodes_1),
        E.fig(*target_figure_content_nodes_2)
    ]
    target_jats_xml = etree.tostring(
        get_target_xml_node(body_nodes=jats_figures)
    )

    merged_figure_text = get_nodes_text(
        [*target_figure_content_nodes_1, ' ', *target_figure_content_nodes_2]
    )
    tei_raw_node = get_training_tei_node([E.figure(merged_figure_text)])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(tei_raw_node))

    LOGGER.debug('target_jats_xml: %s', target_jats_xml)
    test_helper.xml_file_path.write_bytes(target_jats_xml)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'segment-figures': True,
        'fields': 'figure'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    figure_texts = get_text_content_list(get_all_figures(tei_auto_root))
    assert figure_texts == [
        get_nodes_text(target_figure_content_nodes_1),
        get_nodes_text(target_figure_content_nodes_2)
    ]