def test_should_not_include_line_feed_in_tag_if_previous_token_has_different_tag(
            self):
        """Check the line feed stays outside the tags of two differing tokens.

        The first token and every whitespace token of the first line are
        tagged with TAG_1, the last token with TAG_2. The serialized front
        element is then expected to keep the '\\n ' separator between the
        two tag elements rather than inside either of them.
        """
        tei_root = _tei(front_items=[TOKEN_1, '\n ' + TOKEN_2])
        LOGGER.debug('original_root: %s', _to_xml(tei_root))
        doc = GrobidTrainingTeiStructuredDocument(tei_root)
        all_lines = _get_all_lines(doc)

        line1_tokens = list(doc.get_all_tokens_of_line(all_lines[0]))
        whitespace_tokens = list(filter(
            lambda candidate: isinstance(candidate, TeiSpace),
            line1_tokens))
        # The scenario only makes sense if there is whitespace to tag.
        assert whitespace_tokens
        for whitespace_token in whitespace_tokens:
            doc.set_tag(whitespace_token, TAG_1)
        doc.set_tag(line1_tokens[0], TAG_1)
        doc.set_tag(line1_tokens[-1], TAG_2)

        LOGGER.debug('line1_tokens: %s', line1_tokens)

        front = doc.root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))

        expected_template = (
            '<front><{tag1}>{token1}</{tag1}>'
            '\n <{tag2}>{token2}</{tag2}></front>')
        expected_xml = expected_template.format(
            tag1=TAG_1, tag2=TAG_2, token1=TOKEN_1, token2=TOKEN_2)
        assert _to_xml(front) == expected_xml
 def test_should_set_empty_token_whitespace(self):
     """Check that the first token of a note containing '1.' has no whitespace.

     The note text is expected to be split into the tokens '1' and '.';
     the first token's ``whitespace`` attribute must be the empty string.
     """
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[E.note('1.')]))
     lines = _get_all_lines(doc)
     assert _get_token_texts_for_lines(doc, lines) == [['1', '.']]
     # Materialize the tokens before indexing so the test does not depend
     # on get_all_tokens_of_line returning a subscriptable sequence.
     tokens = list(doc.get_all_tokens_of_line(lines[0]))
     assert tokens[0].whitespace == ''
    def test_should_reverse_map_tags(self):
        """Check that an existing TEI path is mapped back to its tag.

        A document whose front contains docTitle/titlePart is loaded with a
        mapping from TAG_1 to that path and with preserve_tags enabled; the
        only token of the only line is then expected to report TAG_1.
        """
        tei_root = _tei(front_items=[E.docTitle(E.titlePart(TOKEN_1))])
        LOGGER.debug('original tei xml: %s', _to_xml(tei_root))
        doc = GrobidTrainingTeiStructuredDocument(
            tei_root,
            tag_to_tei_path_mapping={TAG_1: 'docTitle/titlePart'},
            preserve_tags=True)
        LOGGER.debug('doc: %s', doc)

        # Collect the (preserved) tag value of every token, grouped by line.
        tags_by_line = []
        for line in _get_all_lines(doc):
            line_tags = []
            for token in doc.get_all_tokens_of_line(line):
                line_tags.append(doc.get_tag_or_preserved_tag_value(token))
            tags_by_line.append(line_tags)
        assert tags_by_line == [[TAG_1]]