def test_should_be_able_to_set_tag(self):
     """A tag set on a token can be read back with ``get_tag``."""
     tei_root = _tei(front_items=[E.note(TOKEN_1)])
     doc = GrobidTrainingTeiStructuredDocument(tei_root)
     first_line = _get_all_lines(doc)[0]
     first_token = list(doc.get_tokens_of_line(first_line))[0]
     doc.set_tag(first_token, TAG_1)
     assert doc.get_tag(first_token) == TAG_1
 def test_should_be_able_to_set_tag_with_attribute(self):
     """A tag can be set and read back when its TEI path has an attribute.

     ``TAG_1`` is mapped to the path ``div[@tag="tag1"]`` through
     ``tag_to_tei_path_mapping``.
     """
     tei_root = _tei(front_items=[E.note(TOKEN_1)])
     tag_paths = {TAG_1: 'div[@tag="tag1"]'}
     doc = GrobidTrainingTeiStructuredDocument(
         tei_root, tag_to_tei_path_mapping=tag_paths)
     first_line = _get_all_lines(doc)[0]
     first_token = list(doc.get_tokens_of_line(first_line))[0]
     doc.set_tag(first_token, TAG_1)
     assert doc.get_tag(first_token) == TAG_1
Esempio n. 3
0
def _get_document_tagged_token_lines(
        doc: GrobidTrainingTeiStructuredDocument
) -> List[List[Tuple[str, str]]]:
    """Collect ``(tag, text)`` pairs for every token, grouped by line.

    Pages are visited in the order given by ``doc.get_pages()`` and lines
    in the order given by ``doc.get_lines_of_page(page)``. The page level
    is flattened, so the result holds one inner list per line.
    """
    tagged_lines = []
    for page in doc.get_pages():
        for line in doc.get_lines_of_page(page):
            tagged_lines.append([
                (doc.get_tag(token), doc.get_text(token))
                for token in doc.get_tokens_of_line(line)
            ])
    LOGGER.debug('document_tagged_token_lines: %s', tagged_lines)
    return tagged_lines
    def test_should_not_return_preserved_tag_as_tag_and_update_preserved_tag(
            self):
        """With ``preserve_tags=True`` the initial element is not a tag.

        The token starts inside an ``E.note`` element and ``get_tag`` must
        be falsy for it. After ``set_tag(token1, TAG_1)`` the serialized
        ``front`` element must consist of a single ``TAG_1`` element
        wrapping the token text.
        """
        original_tei_xml = _tei(front_items=[E.note(TOKEN_1)])
        LOGGER.debug('original tei xml: %s', _to_xml(original_tei_xml))
        doc = GrobidTrainingTeiStructuredDocument(
            original_tei_xml,
            tag_to_tei_path_mapping={},
            preserve_tags=True)
        LOGGER.debug('doc: %s', doc)

        first_line = _get_all_lines(doc)[0]
        token1 = list(doc.get_tokens_of_line(first_line))[0]
        assert not doc.get_tag(token1)
        doc.set_tag(token1, TAG_1)

        front = doc.root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        expected_front_xml = (
            '<front><{tag1}>{token1}</{tag1}></front>'
            .format(tag1=TAG_1, token1=TOKEN_1))
        assert _to_xml(front) == expected_front_xml