def fix_reference(ref: etree.Element) -> etree.Element:
    """Apply ``_fix_reference`` to *ref* and verify the text is preserved.

    The text content of *ref* is captured before the fix and compared with
    the text content of the returned element afterwards. The serialized XML
    is logged at debug level before and after the fix.

    Args:
        ref: the reference element to fix.

    Returns:
        The element returned by ``_fix_reference``.

    Raises:
        AssertionError: if the text content changed as a result of the fix.
    """
    text_before_fix = get_text_content(ref)
    LOGGER.debug('ref xml (before): %s', etree.tostring(ref))
    result = _fix_reference(ref)
    LOGGER.debug('ref xml (after): %s', etree.tostring(result))
    # Sanity check: the fix may restructure the element, not alter its text.
    assert text_before_fix == get_text_content(result)
    return result
 def test_should_remove_dot_after_other_special_characters(self):
     """Author parts ending in ``*.`` come out without the ``*`` and ``.``."""
     author_items = _create_author_extracted_items('Mr T*.', 'E*.')
     xml_root = extracted_items_to_xml(author_items)
     assert xml_root is not None
     author_node = xml_root.find(XmlPaths.AUTHOR)
     assert author_node is not None
     given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
     assert get_text_content(given_names_node) == 'Mr T'
     surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
     assert get_text_content(surname_node) == 'E'
 def test_should_not_remove_dot_after_suffix_from_author(self):
     """The trailing dot of ``Mr T.`` and ``Jr.`` is kept in the output."""
     author_items = _create_author_extracted_items('Mr T.', 'Jr.')
     xml_root = extracted_items_to_xml(author_items)
     assert xml_root is not None
     author_node = xml_root.find(XmlPaths.AUTHOR)
     assert author_node is not None
     given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
     assert get_text_content(given_names_node) == 'Mr T.'
     surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
     assert get_text_content(surname_node) == 'Jr.'
 def test_should_remove_special_characters_and_numbers_from_author(self):
     """A ``,+*0123456789`` suffix is dropped from given names and surname."""
     unwanted_suffix = ',+*0123456789'
     given_names_input = TEXT_1 + unwanted_suffix
     surname_input = TEXT_2 + unwanted_suffix
     xml_root = extracted_items_to_xml(
         _create_author_extracted_items(given_names_input, surname_input))
     assert xml_root is not None
     author_node = xml_root.find(XmlPaths.AUTHOR)
     assert author_node is not None
     given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
     assert get_text_content(given_names_node) == TEXT_1
     surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
     assert get_text_content(surname_node) == TEXT_2
    def test_should_remove_invalid_affiliation(
            self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
        """Only the TEI affiliation text remains when invalid ones are removed.

        The target JATS contains a single affiliation. The raw TEI contains
        two: one that differs from the JATS text only by a space before the
        final dot, and one unrelated affiliation. After running ``main`` with
        ``segment-affiliation`` and ``remove-invalid-affiliations`` enabled,
        the text content of the resulting TEI root is expected to equal the
        first TEI affiliation text.
        """
        base_text = 'Some affiliation'
        jats_affiliation_text = base_text + '.'
        tei_affiliation_text = base_text + ' .'
        unrelated_affiliation_text = 'invalid affiliation'

        jats_root = get_target_xml_node(
            affiliation_nodes=[E.aff(jats_affiliation_text)])
        target_jats_xml = etree.tostring(jats_root)

        tei_root = get_affiliation_tei_node([
            TEI_E.affiliation(tei_affiliation_text),
            TEI_E.affiliation(unrelated_affiliation_text)
        ])
        test_helper.tei_raw_file_path.write_bytes(etree.tostring(tei_root))

        LOGGER.debug('target_jats_xml: %s', target_jats_xml)
        test_helper.xml_file_path.write_bytes(target_jats_xml)

        main_args = dict_to_args({
            **test_helper.main_args_dict,
            'matcher': 'simple',
            'fields': 'author_aff',
            'segment-affiliation': True,
            'remove-invalid-affiliations': True
        })
        main(main_args, save_main_session=False)

        tei_auto_root = test_helper.get_tei_auto_root()
        assert get_text_content(tei_auto_root) == tei_affiliation_text
 def test_should_append_to_abstract(self):
     """Two consecutive abstract items are joined by a newline."""
     abstract_items = [
         ExtractedItem(Tags.ABSTRACT, TEXT_1),
         ExtractedItem(Tags.ABSTRACT, TEXT_2)
     ]
     xml_root = extracted_items_to_xml(abstract_items)
     assert xml_root is not None
     abstract_text = get_text_content(xml_root.find(XmlPaths.ABSTRACT))
     expected_text = '\n'.join([TEXT_1, TEXT_2])
     assert abstract_text == expected_text
 def test_should_extract_author_surname_and_given_names_from_single_author(
         self):
     """Given-names and surname sub items of one author appear in the XML."""
     given_names_item = ExtractedItem(SubTags.AUTHOR_GIVEN_NAMES, TEXT_1)
     surname_item = ExtractedItem(SubTags.AUTHOR_SURNAME, TEXT_2)
     author_item = ExtractedItem(
         Tags.AUTHOR,
         ' '.join([TEXT_1, TEXT_2]),
         sub_items=[given_names_item, surname_item])
     xml_root = extracted_items_to_xml([author_item])
     assert xml_root is not None
     author_node = xml_root.find(XmlPaths.AUTHOR)
     assert author_node is not None
     given_names_node = author_node.find(SubXmlPaths.AUTHOR_GIVEN_NAMES)
     assert get_text_content(given_names_node) == TEXT_1
     surname_node = author_node.find(SubXmlPaths.AUTHOR_SURNAME)
     assert get_text_content(surname_node) == TEXT_2
 def test_should_not_append_to_abstract_after_another_tag_occured(self):
     """An abstract item following an author item is not appended."""
     items = [
         ExtractedItem(Tags.ABSTRACT, TEXT_1),
         ExtractedItem(Tags.AUTHOR, TEXT_2),
         ExtractedItem(Tags.ABSTRACT, TEXT_3)
     ]
     xml_root = extracted_items_to_xml(items)
     assert xml_root is not None
     abstract_text = get_text_content(xml_root.find(XmlPaths.ABSTRACT))
     # Only the first abstract text is expected in the abstract element.
     expected_text = '\n'.join([TEXT_1])
     assert abstract_text == expected_text
Esempio n. 9
0
def remove_training_comma_from_element(element: etree.Element):
    """Cut trailing commas and spaces off the tail of the last child.

    The number of trailing ``','`` / ``' '`` characters is measured on the
    whole text content of *element*. If there are any, and the last child
    has a non-empty tail, that many characters (at most the whole tail) are
    cut from the end of the tail and handed to ``add_text_to_tail_prefix``.
    Nothing happens when the element has no children or the last child has
    no tail.

    NOTE(review): "training" in the name presumably means "trailing".

    Args:
        element: the element to modify in place.
    """
    full_text = get_text_content(element)
    trailing_count = len(full_text) - len(full_text.rstrip(', '))
    if trailing_count == 0:
        return
    child_nodes = list(element)
    if not child_nodes:
        return
    last_child = child_nodes[-1]
    last_tail = last_child.tail
    if not last_tail:
        return
    # Never cut more than the tail itself holds.
    keep_length = max(0, len(last_tail) - trailing_count)
    add_text_to_tail_prefix(element, last_tail[keep_length:])
    last_child.tail = last_tail[:keep_length]
Esempio n. 10
0
def remove_surrounding_quotes_from_element(element: etree.Element):
    """Move surrounding (or a lone leading) quote character out of *element*.

    Text content shorter than two characters is left alone. Otherwise:

    * if ``has_surrounding_quotes`` accepts the text content, the first
      character of ``element.text`` is passed to ``add_text_to_previous``
      and the last character of the last child's tail is passed to
      ``add_text_to_tail_prefix``; each is removed from where it was. Either
      step is skipped when the respective text is empty or missing.
    * else, if the text starts with a left quote character whose matching
      right quote character does not occur in the rest of the text, only
      the first character of ``element.text`` is moved out.

    Args:
        element: the element to modify in place.
    """
    full_text = get_text_content(element)
    if len(full_text) < 2:
        return
    child_nodes = list(element)

    if has_surrounding_quotes(full_text):
        if element.text:
            add_text_to_previous(element, element.text[:1])
            element.text = element.text[1:]
        if child_nodes and child_nodes[-1].tail:
            last_child = child_nodes[-1]
            add_text_to_tail_prefix(element, last_child.tail[-1:])
            last_child.tail = last_child.tail[:-1]
        return

    first_char = full_text[0]
    if first_char not in LEFT_QUOTE_CHARS:
        return
    closing_quote_char = RIGHT_BY_LEFT_QUOTE_CHAR[first_char]
    # Leave the text untouched when the quote is closed later on, or when
    # there is no leading element text to take the character from.
    if closing_quote_char in full_text[1:] or not element.text:
        return
    add_text_to_previous(element, element.text[:1])
    element.text = element.text[1:]
    def test_should_extract_from_simple_annotated_document(self):
        """Running ``main`` on a one-token LXML file writes the title to XML.

        A document with a single token tagged as title is written to a
        temporary directory, ``main`` is invoked with the input and output
        paths, and the title of the parsed output must equal the token text.
        """
        with TemporaryDirectory() as temp_dir:
            title_token = E.TOKEN(TEXT_1, {'tag': Tags.TITLE})
            lxml_root = E.DOCUMENT(E.PAGE(E.TEXT(title_token)))

            lxml_path = os.path.join(temp_dir, 'test.lxml')
            with open(lxml_path, 'wb') as lxml_file:
                lxml_file.write(etree.tostring(lxml_root))

            output_path = os.path.join(temp_dir, 'test.xml')

            cli_args = [
                '--lxml-path=%s' % lxml_path,
                '--output-path=%s' % output_path
            ]
            main(cli_args)

            output_root = etree.parse(output_path)
            title_text = get_text_content(output_root.find(XmlPaths.TITLE))
            assert title_text == TEXT_1
Esempio n. 12
0
 def process_request(self,
                     data: dict,
                     session: requests.Session,
                     context: dict = None):
     """Send the text of each node matching the xpath to the API.

     ``data['content']`` is parsed as XML. For every node matched by
     ``self._xpath`` the text content is POSTed (UTF-8 encoded) to
     ``self._api_url``; when the response text differs from the sent text,
     ``apply_revised_value`` is called with the node and the response text.

     Args:
         data: request data; its ``'content'`` entry holds the XML.
         session: the session used to issue the POST requests.
         context: passed on to ``get_default_request_timeout``.

     Returns:
         *data* itself when no node matches; otherwise the result of
         ``extend_dict`` with ``'content'`` set to the serialized XML.

     Raises:
         Whatever ``response.raise_for_status()`` raises for a failed
         request.
     """
     xml_root = etree.fromstring(data['content'])
     target_nodes = xml_root.xpath(self._xpath)
     if not target_nodes:
         LOGGER.info('xpath not matching any element: %s', self._xpath)
         return data
     for target_node in target_nodes:
         current_text = get_text_content(target_node)
         LOGGER.debug('node for xpath %s: %s (text: %s)', self._xpath,
                      target_node, current_text)
         api_url = self._api_url
         payload = current_text.encode('utf-8')
         request_timeout = self.get_default_request_timeout(context=context)
         response = session.post(
             api_url, data=payload, timeout=request_timeout)
         response.raise_for_status()
         revised_text = response.text
         LOGGER.debug('revised_value: %s (was: %s)', revised_text,
                      current_text)
         if revised_text == current_text:
             # Nothing to change for this node.
             continue
         apply_revised_value(target_node, revised_text)
     updated_content = {'content': etree.tostring(xml_root)}
     return extend_dict(data, updated_content)
Esempio n. 13
0
 def test_should_return_simple_text(self):
     """Text placed directly inside an element is returned unchanged."""
     parent_node = E.parent(SOME_VALUE_1)
     text_content = get_text_content(parent_node)
     assert text_content == SOME_VALUE_1
 def test_should_populate_title(self):
     """A single title item ends up as the title text of the XML."""
     title_item = ExtractedItem(Tags.TITLE, TEXT_1)
     xml_root = extracted_items_to_xml([title_item])
     assert xml_root is not None
     title_text = get_text_content(xml_root.find(XmlPaths.TITLE))
     assert title_text == TEXT_1
Esempio n. 15
0
 def test_should_return_text_of_child_element(self):
     """Text that only lives in a child element is still returned."""
     child_node = E.child(SOME_VALUE_1)
     parent_node = E.parent(child_node)
     assert get_text_content(parent_node) == SOME_VALUE_1
Esempio n. 16
0
 def test_should_return_text_of_child_element_and_preceeding_text(self):
     """Text before a child and the child's text are concatenated in order."""
     child_node = E.child(SOME_VALUE_2)
     parent_node = E.parent(SOME_VALUE_1, child_node)
     expected_text = SOME_VALUE_1 + SOME_VALUE_2
     assert get_text_content(parent_node) == expected_text
Esempio n. 17
0
 def test_should_return_text_of_child_element_and_trailing_text(self):
     """A child's text and the text after it are concatenated in order."""
     child_node = E.child(SOME_VALUE_1)
     parent_node = E.parent(child_node, SOME_VALUE_2)
     expected_text = SOME_VALUE_1 + SOME_VALUE_2
     assert get_text_content(parent_node) == expected_text
Esempio n. 18
0
def _extract_value_from_file(file_path, xpath, namespaces):
    """Return the newline-joined text of all nodes matching *xpath*.

    Args:
        file_path: passed to ``_load_xml`` to obtain the XML root.
        xpath: the xpath expression evaluated on the root.
        namespaces: namespaces passed on to the xpath evaluation.

    Returns:
        The text content of each matching node, joined with ``'\\n'``
        (an empty string when nothing matches).
    """
    xml_root = _load_xml(file_path)
    matching_nodes = xml_root.xpath(xpath, namespaces=namespaces)
    node_texts = [get_text_content(node) for node in matching_nodes]
    return '\n'.join(node_texts)
def get_stripped_text_content(node, **kwargs):
    """Return the text content of *node*, stripped and whitespace-processed.

    The text from ``get_text_content`` (called with *kwargs*) is stripped at
    both ends and then passed through ``strip_whitespace``.
    """
    stripped_text = get_text_content(node, **kwargs).strip()
    return strip_whitespace(stripped_text)
Esempio n. 20
0
 def test_should_return_text_of_parent_excluding_children_to_exclude(self):
     """The text of a child listed in ``exclude`` is left out of the result."""
     excluded_child = E.child(SOME_VALUE_1)
     parent_node = E.parent(excluded_child, SOME_VALUE_2)
     text_content = get_text_content(parent_node, exclude=[excluded_child])
     assert text_content == SOME_VALUE_2
Esempio n. 21
0
def _get_text(xml, xpath):
    """Return the text of the item that ``_get_item`` finds for *xpath*.

    ``get_text_content`` is tried first; if it raises ``AttributeError``
    the item is converted with ``text_type`` instead.
    """
    found_item = _get_item(xml, xpath)
    try:
        text = get_text_content(found_item)
    except AttributeError:
        # presumably the item is not an element (e.g. a plain value)
        text = text_type(found_item)
    return text
def get_node_text(node: Union[str, etree.ElementBase]) -> str:
    """Return the text of *node*, which is either a string or an element.

    Strings (including ``str`` subclasses) are converted with ``str()``;
    anything else is passed to ``get_text_content``.
    """
    if not isinstance(node, str):
        return get_text_content(node)
    return str(node)