Example #1
0
 def annotate(self,
              structured_document: GrobidTrainingTeiStructuredDocument):
     """Assign a common group tag to runs of tokens following a matched tag.

     Iterates over all non-space tokens; when a token's tag maps to a
     group tag (via ``config.get_group_tag_for_tag_fn``), subsequent
     untagged tokens are tagged with that group tag (I- prefixed) until
     the next tagged token is seen.

     :param structured_document: the document to annotate in-place
     :return: the same (mutated) structured document
     """
     # There is currently no support for more than two level tagging,
     # which would allow a parent level to be represented.
     # As a workaround we are adding a separate parent tag to the tokens without tag.
     # That way those tokens will share a common parent element in the output.
     tag_level = self.config.tag_level
     all_tokens_iterable = iter_all_tokens_excluding_space(
         structured_document)
     unmatched_tags = set()
     current_group_tag = None
     for token in all_tokens_iterable:
         if tag_level:
             # if we are looking at sub tags, then only consider tokens with a tag
             if not structured_document.get_tag_or_preserved_tag(token):
                 continue
         tag = structured_document.get_tag_or_preserved_tag_value(
             token, level=tag_level)
         if tag:
             # a tagged token either starts a new group or, if it has no
             # mapped group tag, ends the current group
             current_group_tag = self.config.get_group_tag_for_tag_fn(tag)
             if not current_group_tag:
                 unmatched_tags.add(tag)
             continue
         if not current_group_tag:
             # untagged token outside any active group: leave as-is
             continue
         structured_document.set_tag_only(token,
                                          add_tag_prefix(
                                              current_group_tag,
                                              prefix=I_TAG_PREFIX),
                                          level=tag_level)
         LOGGER.debug('updated group token (%r): %s', current_group_tag,
                      token)
     LOGGER.debug('ignored unmatched tags: %s', unmatched_tags)
     return structured_document
Example #2
0
def get_line_number_candidates(
        structured_document: GrobidTrainingTeiStructuredDocument, tokens: list,
        min_line_number: int, max_line_number_gap: int):
    """Find tokens that likely form line number sequences.

    Tokens whose text looks like a line number are sorted by parsed number
    and position, then grouped into ascending sequences (allowing gaps of up
    to ``max_line_number_gap``); only sequences with at least
    ``min_line_number`` members are accepted.

    :param structured_document: document used to read token text
    :param tokens: the tokens to scan, in document order
    :param min_line_number: minimum sequence length to accept
    :param max_line_number_gap: maximum allowed gap between consecutive numbers
    :return: list of tokens of all accepted sequences
    """
    candidates = []
    for position, token in enumerate(tokens, 1):
        token_text = structured_document.get_text(token)
        if is_line_number(token_text):
            candidates.append((token, parse_line_number(token_text), position))
    if not candidates:
        return []
    # order by parsed number first, token position second
    candidates.sort(key=lambda entry: (entry[1], entry[2]))
    sequences = [[candidates[0]]]
    for entry in candidates[1:]:
        _token, num, position = entry
        current_sequence = sequences[-1]
        previous_entry = current_sequence[-1]
        _, previous_num, previous_position = previous_entry
        next_expected_num = previous_num + 1
        if position < previous_position or num == previous_num:
            # duplicate number or position going backwards: skip entirely
            LOGGER.debug('ignoring out of sequence: %s (prev: %s)', entry,
                         previous_entry)
        elif next_expected_num <= num <= next_expected_num + max_line_number_gap:
            # continues the current sequence (within the allowed gap)
            current_sequence.append(entry)
        else:
            # too large a jump: start a new sequence
            sequences.append([entry])
    result = []
    for sequence in sequences:
        if len(sequence) >= min_line_number:
            result.extend(token for token, _, _ in sequence)
    return result
 def annotate(self, structured_document: GrobidTrainingTeiStructuredDocument):
     """Extend enabled tags backwards over the untagged tokens before them.

     When a token tagged with a value in ``config.enabled_tags`` is
     preceded by untagged tokens, those tokens receive the same value
     (B-/I- prefixed) and the tagged token itself becomes a continuation
     (I- prefixed). Untagged tokens before other tag values are skipped.

     :param structured_document: the document to annotate in-place
     :return: the same (mutated) structured document
     """
     pending_untagged_tokens = []
     not_enabled_tag_values = set()
     for current_token in iter_all_tokens_excluding_space(structured_document):
         current_tag_value = structured_document.get_tag_or_preserved_tag_value(
             current_token
         )
         if not current_tag_value:
             # remember untagged tokens until we know what tag follows them
             pending_untagged_tokens.append(current_token)
             continue
         if not pending_untagged_tokens:
             continue
         if current_tag_value not in self.config.enabled_tags:
             LOGGER.debug(
                 'ignoring untagged tokens (before %r): %s',
                 current_tag_value, pending_untagged_tokens
             )
             not_enabled_tag_values.add(current_tag_value)
             pending_untagged_tokens.clear()
             continue
         LOGGER.debug(
             'updated untagged tokens (before %r): %s',
             current_tag_value, pending_untagged_tokens
         )
         for offset, untagged_token in enumerate(pending_untagged_tokens):
             # first retagged token begins the span, the rest continue it
             tag_prefix = B_TAG_PREFIX if offset == 0 else I_TAG_PREFIX
             structured_document.set_tag_only(
                 untagged_token,
                 add_tag_prefix(current_tag_value, tag_prefix)
             )
         structured_document.set_tag_only(
             current_token,
             add_tag_prefix(current_tag_value, I_TAG_PREFIX)
         )
         pending_untagged_tokens.clear()
     LOGGER.debug('ignore not enabled tag values: %s', not_enabled_tag_values)
     return structured_document
 def test_should_set_empty_token_whitespace(self):
     """Tokens split from the same text node should carry no whitespace between them."""
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[E.note('1.')]))
     lines = _get_all_lines(doc)
     # '1.' is tokenized into '1' and '.' within a single line
     assert _get_token_texts_for_lines(doc, lines) == [['1', '.']]
     tokens = doc.get_all_tokens_of_line(lines[0])
     # no whitespace between '1' and '.'
     assert tokens[0].whitespace == ''
 def annotate(self, structured_document: GrobidTrainingTeiStructuredDocument):
     """Extend enabled tags forward over the untagged tokens following them.

     Untagged tokens that directly follow a token whose tag value is in
     ``config.enabled_tags`` receive that tag value (I- prefixed) until
     the next tagged token is encountered.

     :param structured_document: the document to annotate in-place
     :return: the same (mutated) structured document
     """
     all_tokens_iterable = iter_all_tokens_excluding_space(structured_document)
     # tag value currently being extended (None if the last tag was not enabled)
     previous_enabled_tag_value: Optional[str] = None
     # tokens tagged so far for the current run (for logging only)
     previous_included_tokens: List[Any] = []
     ignored_token_tag_values = set()
     for token in all_tokens_iterable:
         tag_value = structured_document.get_tag_or_preserved_tag_value(token)
         if tag_value:
             # a tagged token ends the current run; log what was included
             _log_previous_included_tokens(
                 previous_enabled_tag_value, previous_included_tokens
             )
             previous_included_tokens.clear()
             previous_enabled_tag_value = (
                 tag_value
                 if tag_value in self.config.enabled_tags
                 else None
             )
             if not previous_enabled_tag_value:
                 ignored_token_tag_values.add(tag_value)
             continue
         if not previous_enabled_tag_value:
             # untagged token not preceded by an enabled tag: leave as-is
             continue
         structured_document.set_tag_only(
             token,
             add_tag_prefix(previous_enabled_tag_value, I_TAG_PREFIX)
         )
         previous_included_tokens.append(token)
     # log the trailing run, if any
     _log_previous_included_tokens(
         previous_enabled_tag_value, previous_included_tokens
     )
     LOGGER.debug('ignore not enabled tag values: %s', ignored_token_tag_values)
     return structured_document
Example #6
0
def iter_find_line_number_tokens(
        structured_document: GrobidTrainingTeiStructuredDocument, **kwargs):
    """Yield candidate line number tokens from every page of the document.

    :param structured_document: the document to scan
    :param kwargs: passed through to ``iter_find_line_number_tokens_in_lines``
    """
    for current_page in structured_document.get_pages():
        page_lines = structured_document.get_lines_of_page(current_page)
        yield from iter_find_line_number_tokens_in_lines(
            structured_document,
            lines=page_lines,
            **kwargs
        )
Example #7
0
def _get_document_tagged_token_lines(
        doc: GrobidTrainingTeiStructuredDocument
) -> List[List[Tuple[str, str]]]:
    """Return, for each line of each page, the (tag, text) pairs of its tokens.

    :param doc: the structured document to read
    :return: one list per line, in page then line order
    """
    tagged_token_lines = []
    for page in doc.get_pages():
        for line in doc.get_lines_of_page(page):
            tagged_token_lines.append([
                (doc.get_tag(token), doc.get_text(token))
                for token in doc.get_tokens_of_line(line)
            ])
    LOGGER.debug('document_tagged_token_lines: %s',
                 tagged_token_lines)
    return tagged_token_lines
    def test_should_not_include_line_feed_in_tag_if_previous_token_has_different_tag(
            self):
        """A line feed before a differently-tagged token should stay outside both tags."""
        original_root = _tei(front_items=[TOKEN_1, '\n ' + TOKEN_2])
        LOGGER.debug('original_root: %s', _to_xml(original_root))
        doc = GrobidTrainingTeiStructuredDocument(original_root)
        lines = _get_all_lines(doc)

        # tag the space tokens the same as the first token
        line1_tokens = list(doc.get_all_tokens_of_line(lines[0]))
        space_tokens = [t for t in line1_tokens if isinstance(t, TeiSpace)]
        assert space_tokens
        for token in space_tokens:
            doc.set_tag(token, TAG_1)
        doc.set_tag(line1_tokens[0], TAG_1)
        doc.set_tag(line1_tokens[-1], TAG_2)

        LOGGER.debug('line1_tokens: %s', line1_tokens)

        root = doc.root
        front = root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        # the line feed and space remain between the two tag elements
        assert _to_xml(front) == (
            '<front><{tag1}>{token1}</{tag1}>'
            '\n <{tag2}>{token2}</{tag2}></front>'.format(tag1=TAG_1,
                                                          tag2=TAG_2,
                                                          token1=TOKEN_1,
                                                          token2=TOKEN_2))
Example #9
0
 def annotate(
     self, structured_document: GrobidTrainingTeiStructuredDocument
 ) -> GrobidTrainingTeiStructuredDocument:
     """Detect line number tokens and tag each with the configured tag.

     :param structured_document: the document to annotate in-place
     :return: the same (mutated) structured document
     """
     for line_number_token in iter_find_line_number_tokens(
         structured_document,
         min_line_number=self.config.min_line_number,
         max_line_number_gap=self.config.max_line_number_gap,
         line_number_ratio_threshold=self.config.line_number_ratio_threshold
     ):
         structured_document.set_tag(line_number_token, self.config.tag)
     return structured_document
 def test_should_be_able_get_root_with_updated_single_token_tag(self):
     """Setting a tag on a single token should wrap it in an element named after the tag."""
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[E.note(TOKEN_1)]))
     lines = _get_all_lines(doc)
     tokens = list(doc.get_tokens_of_line(lines[0]))
     token = tokens[0]
     doc.set_tag(token, TAG_1)
     root = doc.root
     front = root.find('./text/front')
     child_elements = list(front)
     # the original note element is replaced by the tag element
     assert [c.tag for c in child_elements] == [TAG_1]
     assert [c.text for c in child_elements] == [TOKEN_1]
    def test_should_reverse_map_tags(self):
        """Preserved TEI paths should be reverse-mapped back to tag values."""
        tag_to_tei_path_mapping = {TAG_1: 'docTitle/titlePart'}
        original_tei_xml = _tei(front_items=[E.docTitle(E.titlePart(TOKEN_1))])
        LOGGER.debug('original tei xml: %s', _to_xml(original_tei_xml))
        doc = GrobidTrainingTeiStructuredDocument(
            original_tei_xml,
            tag_to_tei_path_mapping=tag_to_tei_path_mapping,
            preserve_tags=True)
        LOGGER.debug('doc: %s', doc)

        # the token inside docTitle/titlePart reports TAG_1 as its tag value
        assert [[
            doc.get_tag_or_preserved_tag_value(t)
            for t in doc.get_all_tokens_of_line(line)
        ] for line in _get_all_lines(doc)] == [[TAG_1]]
 def test_should_be_able_get_root_with_updated_single_token_tag_with_attribute(
         self):
     """A mapped TEI path with an attribute predicate should produce that attribute."""
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[E.note(TOKEN_1)]),
         tag_to_tei_path_mapping={TAG_1: 'div[@tag="tag1"]'})
     lines = _get_all_lines(doc)
     tokens = list(doc.get_tokens_of_line(lines[0]))
     token = tokens[0]
     doc.set_tag(token, TAG_1)
     root = doc.root
     front = root.find('./text/front')
     child_elements = list(front)
     # the path expression is translated into element name plus attribute
     assert [c.tag for c in child_elements] == ['div']
     assert [c.attrib for c in child_elements] == [{'tag': 'tag1'}]
     assert [c.text for c in child_elements] == [TOKEN_1]
 def test_should_find_one_line_with_one_token_at_front_level(self):
     """A custom container node path should still expose lines and tokens."""
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[E.note(TOKEN_1)]),
         container_node_path='text/front')
     lines = _get_all_lines(doc)
     assert _get_token_texts_for_lines(doc, lines) == [[TOKEN_1]]
     # the original note element is left untouched
     assert doc.root.find('./text/front/note').text == TOKEN_1
 def test_should_find_empty_first_line_and_text_outside_semantic_element(
         self):
     """Text outside semantic elements should be split into lines at lb elements."""
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[TOKEN_1,
                           _tei_lb(), TOKEN_2,
                           E.note(TOKEN_3)]))
     lines = _get_all_lines(doc)
     # the lb splits TOKEN_1 from TOKEN_2; the note's token joins the second line
     assert _get_token_texts_for_lines(doc, lines) == [[TOKEN_1],
                                                       [TOKEN_2, TOKEN_3]]
    def test_should_remove_untagged_including_line_feed(self):
        """remove_all_untagged should drop untagged tokens together with their line feeds."""
        doc = GrobidTrainingTeiStructuredDocument(
            _tei(front_items=[
                TOKEN_1,
                E(TeiTagNames.LB), ' ' + TOKEN_2 + ' ' + TOKEN_3
            ]))
        lines = _get_all_lines(doc)

        # explicitly untag the first token
        line1_tokens = list(doc.get_tokens_of_line(lines[0]))
        doc.set_tag(line1_tokens[0], None)

        # only the last token of the second line is tagged
        line2_tokens = list(doc.get_tokens_of_line(lines[1]))
        doc.set_tag(line2_tokens[-1], TAG_1)

        doc.remove_all_untagged()

        root = doc.root
        front = root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        # only the tagged TOKEN_3 survives
        assert _to_xml(front) == (
            '<front><{tag1}>{token3}</{tag1}></front>'.format(tag1=TAG_1,
                                                              token3=TOKEN_3))
 def test_should_be_able_to_set_tag(self):
     """A tag set on a token should be readable back via get_tag."""
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[E.note(TOKEN_1)]))
     lines = _get_all_lines(doc)
     tokens = list(doc.get_tokens_of_line(lines[0]))
     token = tokens[0]
     doc.set_tag(token, TAG_1)
     assert doc.get_tag(token) == TAG_1
def _add_name_text_suffix(
        structured_document: GrobidTrainingTeiStructuredDocument,
        tokens: List[Any],
        config: ReferenceAnnotatorConfig):
    """Extend name sub tags over suffix tokens and write them back.

    Reads the current sub tags, texts and whitespace of *tokens*, remaps
    the sub tags via ``config.sub_tag_map``, extends them over suffix
    tokens (for tags in ``config.include_suffix_enabled_sub_tags``) and
    stores the resulting sub tags on the tokens.

    :param structured_document: the document to update in-place
    :param tokens: the tokens of a reference to process
    :param config: annotator configuration
    :return: the same (mutated) structured document
    """
    original_sub_tags = [structured_document.get_sub_tag(t) for t in tokens]
    texts = [structured_document.get_text(t) for t in tokens]
    whitespaces = [structured_document.get_whitespace(t) for t in tokens]
    remapped_sub_tags = _map_tags(original_sub_tags, config.sub_tag_map)
    extended_sub_tags = get_suffix_extended_token_tags(
        remapped_sub_tags,
        texts,
        token_whitespaces=whitespaces,
        enabled_tags=config.include_suffix_enabled_sub_tags
    )
    LOGGER.debug(
        'name suffix sub tokens, transformed: %s -> %s -> %s (tokens: %s)',
        original_sub_tags, remapped_sub_tags, extended_sub_tags, tokens
    )
    for current_token, extended_sub_tag in zip(tokens, extended_sub_tags):
        # only overwrite where the transformation produced a sub tag
        if extended_sub_tag:
            structured_document.set_sub_tag(current_token, extended_sub_tag)
    return structured_document
 def test_should_be_able_to_set_tag_with_attribute(self):
     """Setting a tag mapped to a path with an attribute predicate should round-trip."""
     doc = GrobidTrainingTeiStructuredDocument(
         _tei(front_items=[E.note(TOKEN_1)]),
         tag_to_tei_path_mapping={TAG_1: 'div[@tag="tag1"]'})
     lines = _get_all_lines(doc)
     tokens = list(doc.get_tokens_of_line(lines[0]))
     token = tokens[0]
     doc.set_tag(token, TAG_1)
     assert doc.get_tag(token) == TAG_1
    def test_should_preserve_existing_tag(self):
        """With preserve_tags enabled, existing TEI elements should survive serialization."""
        original_tei_xml = _tei(
            front_items=[E.docTitle(E.titlePart(TOKEN_1)), TOKEN_2])
        LOGGER.debug('original tei xml: %s', _to_xml(original_tei_xml))
        doc = GrobidTrainingTeiStructuredDocument(original_tei_xml,
                                                  preserve_tags=True,
                                                  tag_to_tei_path_mapping={})
        LOGGER.debug('doc: %s', doc)

        root = doc.root
        front = root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        # docTitle/titlePart is preserved even though no tags were set
        assert _to_xml(front) == (
            '<front><docTitle><titlePart>{token1}</titlePart></docTitle>{token2}</front>'
            .format(token1=TOKEN_1, token2=TOKEN_2))
    def test_should_preserve_existing_nested_tag(self):
        """Nested elements inside a mapped element should be preserved as-is."""
        original_tei_xml = _tei(
            front_items=[E.div(E.label(TOKEN_1), TOKEN_2), TOKEN_3])
        LOGGER.debug('original tei xml: %s', _to_xml(original_tei_xml))
        doc = GrobidTrainingTeiStructuredDocument(
            original_tei_xml,
            preserve_tags=True,
            tag_to_tei_path_mapping={TAG_1: 'div'})
        LOGGER.debug('doc: %s', doc)

        root = doc.root
        front = root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        # the label element nested inside div is kept
        assert _to_xml(front) == (
            '<front><div><label>{token1}</label>{token2}</div>{token3}</front>'
            .format(token1=TOKEN_1, token2=TOKEN_2, token3=TOKEN_3))
    def test_should_preserve_separation_between_existing_tag(self):
        """Adjacent preserved elements with the same path should not be merged."""
        original_tei_xml = _tei(front_items=[
            E.byline(E.affiliation(TOKEN_1)),
            E.byline(E.affiliation(TOKEN_2))
        ])
        LOGGER.debug('original tei xml: %s', _to_xml(original_tei_xml))
        doc = GrobidTrainingTeiStructuredDocument(original_tei_xml,
                                                  preserve_tags=True,
                                                  tag_to_tei_path_mapping={})
        LOGGER.debug('doc: %s', doc)

        root = doc.root
        front = root.find('./text/front')
        affiliations = front.xpath('./byline/affiliation')
        LOGGER.debug('xml: %s', _to_xml(front))
        # two separate byline/affiliation elements remain
        assert [aff.text for aff in affiliations] == [TOKEN_1, TOKEN_2]
    def test_should_preserve_existing_tag_with_attrib(self):
        """Preserved elements should keep their attributes through serialization."""
        original_tei_xml = _tei(
            front_items=[E.div(TOKEN_1, {'tag': TAG_1}), TOKEN_2])
        LOGGER.debug('original tei xml: %s', _to_xml(original_tei_xml))
        doc = GrobidTrainingTeiStructuredDocument(
            original_tei_xml,
            preserve_tags=True,
            tag_to_tei_path_mapping={TAG_1: 'div[@tag="tag1"]'})
        LOGGER.debug('doc: %s', doc)

        root = doc.root
        front = root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        # the div keeps its tag attribute
        assert _to_xml(front) == (
            '<front><div tag="{TAG_1}">{token1}</div>{token2}</front>'.format(
                token1=TOKEN_1, token2=TOKEN_2, TAG_1=TAG_1))
    def test_should_preserve_space_after_lb_in_updated_root(self):
        """Tokens with the same tag across an lb should merge into one element keeping the space."""
        doc = GrobidTrainingTeiStructuredDocument(
            _tei(front_items=[TOKEN_1,
                              E(TeiTagNames.LB), ' ' + TOKEN_2]))
        lines = _get_all_lines(doc)

        line1_tokens = list(doc.get_tokens_of_line(lines[0]))
        doc.set_tag(line1_tokens[0], TAG_1)

        # same tag on the second line's last token: both lines share one element
        line2_tokens = list(doc.get_tokens_of_line(lines[1]))
        doc.set_tag(line2_tokens[-1], TAG_1)

        root = doc.root
        front = root.find('./text/front')
        child_elements = list(front)
        assert [c.tag for c in child_elements] == [TAG_1]
        # the lb and the following space stay inside the tag element
        assert _to_xml(child_elements[0]) == (
            '<{tag1}>{token1}<{lb}/> {token2}</{tag1}>'.format(
                tag1=TAG_1, token1=TOKEN_1, token2=TOKEN_2, lb=TeiTagNames.LB))
Example #24
0
def _simple_document_with_tagged_token_lines(
        lines: List[List[Tuple[str,
                               str]]]) -> GrobidTrainingTeiStructuredDocument:
    """Build a structured document from (tag, token) pairs, one inner list per line.

    Each line's tokens are joined with spaces and separated by lb elements;
    the given tags are then applied to the resulting document tokens.

    :param lines: list of lines, each a list of (tag, token text) pairs;
        a falsy tag leaves the token untagged
    :return: the constructed and tagged structured document
    """
    tei_items = []
    for line in lines:
        tei_items.append(' '.join(token for _, token in line))
        tei_items.append(E.lb())
    doc = GrobidTrainingTeiStructuredDocument(
        _tei(tei_items), container_node_path=DEFAULT_CONTAINER_NODE_PATH)
    doc_lines = [
        line for page in doc.get_pages()
        for line in doc.get_lines_of_page(page)
    ]
    for line, doc_line in zip(lines, doc_lines):
        for (tag, token), doc_token in zip(line,
                                           doc.get_tokens_of_line(doc_line)):
            # sanity check: document tokenization must match the input
            assert token == doc.get_text(doc_token)
            if tag:
                doc.set_tag(doc_token, tag)
    return doc
    def test_should_not_include_space_in_tag_if_previous_token_has_different_tag(
            self):
        """The space after an lb should stay outside when the following tag differs."""
        doc = GrobidTrainingTeiStructuredDocument(
            _tei(front_items=[TOKEN_1,
                              E(TeiTagNames.LB), ' ' + TOKEN_2]))
        lines = _get_all_lines(doc)

        line1_tokens = list(doc.get_tokens_of_line(lines[0]))
        doc.set_tag(line1_tokens[0], TAG_1)

        # different tag on the second line's last token
        line2_tokens = list(doc.get_tokens_of_line(lines[1]))
        doc.set_tag(line2_tokens[-1], TAG_2)

        root = doc.root
        front = root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        # the lb stays inside the first tag; the space sits between the two tags
        assert _to_xml(front) == ('<front><{tag1}>{token1}<{lb}/></{tag1}>'
                                  ' <{tag2}>{token2}</{tag2}></front>'.format(
                                      tag1=TAG_1,
                                      tag2=TAG_2,
                                      token1=TOKEN_1,
                                      token2=TOKEN_2,
                                      lb=TeiTagNames.LB))
    def test_should_not_return_preserved_tag_as_tag_and_update_preserved_tag(
            self):
        """Preserved tags should not be reported via get_tag and should be replaceable."""
        original_tei_xml = _tei(front_items=[E.note(TOKEN_1)])
        LOGGER.debug('original tei xml: %s', _to_xml(original_tei_xml))
        doc = GrobidTrainingTeiStructuredDocument(original_tei_xml,
                                                  preserve_tags=True,
                                                  tag_to_tei_path_mapping={})
        LOGGER.debug('doc: %s', doc)
        lines = _get_all_lines(doc)
        token1 = list(doc.get_tokens_of_line(lines[0]))[0]
        # the preserved note element is not exposed as a regular tag
        assert not doc.get_tag(token1)
        doc.set_tag(token1, TAG_1)

        root = doc.root
        front = root.find('./text/front')
        LOGGER.debug('xml: %s', _to_xml(front))
        # setting a tag replaces the preserved note element
        assert _to_xml(front) == (
            '<front><{tag1}>{token1}</{tag1}></front>'.format(token1=TOKEN_1,
                                                              tag1=TAG_1))
def _structured_document_with_title(title=TITLE_1):
    """Return a structured document whose TEI contains the given title."""
    tei_root = _tei_with_title(title)
    return GrobidTrainingTeiStructuredDocument(tei_root)
def _structured_document_with_sub_elements(*args, **kwargs):
    """Return a structured document with sub elements and the default tag mapping.

    All arguments are forwarded to ``_tei_with_sub_elements``.
    """
    tei_root = _tei_with_sub_elements(*args, **kwargs)
    return GrobidTrainingTeiStructuredDocument(
        tei_root,
        tag_to_tei_path_mapping=TAG_TO_TEI_PATH_MAPPING
    )
Example #29
0
 def test_should_not_fail_on_empty_document(self):
     """Annotating an empty document should be a no-op rather than an error."""
     structured_document = GrobidTrainingTeiStructuredDocument(_tei())
     TextLineNumberAnnotator().annotate(structured_document)
Example #30
0
def iter_first_tokens_of_lines(
        structured_document: GrobidTrainingTeiStructuredDocument, lines: list):
    """Yield the first token of each line that has any tokens.

    Lines without tokens are skipped.

    :param structured_document: document used to read the tokens of each line
    :param lines: the lines to scan
    """
    for current_line in lines:
        line_tokens = structured_document.get_tokens_of_line(current_line)
        if not line_tokens:
            continue
        yield line_tokens[0]