def annotate(self, structured_document: GrobidTrainingTeiStructuredDocument):
     """Tag runs of untagged tokens with the tag of the tagged token that follows.

     Tokens are visited in the order produced by
     ``iter_all_tokens_excluding_space``. Tokens for which
     ``get_tag_or_preserved_tag_value`` returns a falsy value are collected
     until a tagged token is reached. If that token's tag value is listed in
     ``self.config.enabled_tags``, the first collected token is re-tagged with
     the ``B_TAG_PREFIX`` variant of the tag value, the remaining collected
     tokens and the tagged token itself with the ``I_TAG_PREFIX`` variant.
     Otherwise the collected tokens are dropped without modification.

     Untagged tokens after the last tagged token are left untouched.

     Returns:
         The same ``structured_document`` instance that was passed in.
     """
     pending_tokens = []
     skipped_tag_values = set()
     for token in iter_all_tokens_excluding_space(structured_document):
         tag_value = structured_document.get_tag_or_preserved_tag_value(token)
         if not tag_value:
             # no tag yet: remember the token until we know what follows it
             pending_tokens.append(token)
         elif pending_tokens:
             if tag_value in self.config.enabled_tags:
                 LOGGER.debug(
                     'updated untagged tokens (before %r): %s',
                     tag_value, pending_tokens
                 )
                 # the first pending token starts the entity,
                 # everything after it continues the entity
                 prefix = B_TAG_PREFIX
                 for pending_token in pending_tokens:
                     structured_document.set_tag_only(
                         pending_token,
                         add_tag_prefix(tag_value, prefix)
                     )
                     prefix = I_TAG_PREFIX
                 # the tagged token is no longer the start of the entity
                 structured_document.set_tag_only(
                     token,
                     add_tag_prefix(tag_value, I_TAG_PREFIX)
                 )
             else:
                 LOGGER.debug(
                     'ignoring untagged tokens (before %r): %s',
                     tag_value, pending_tokens
                 )
                 skipped_tag_values.add(tag_value)
             # either way the pending run ends at this tagged token
             pending_tokens.clear()
     LOGGER.debug('ignore not enabled tag values: %s', skipped_tag_values)
     return structured_document
 def annotate(self, structured_document: GrobidTrainingTeiStructuredDocument):
     """Extend an enabled tag over the untagged tokens that follow it.

     Tokens are visited in the order produced by
     ``iter_all_tokens_excluding_space``. Each tagged token decides whether
     the following untagged tokens get tagged: if its tag value is listed in
     ``self.config.enabled_tags``, every untagged token up to the next tagged
     token receives the ``I_TAG_PREFIX`` variant of that tag value; otherwise
     the following untagged tokens are left as they are.

     The tokens tagged for each run are handed to
     ``_log_previous_included_tokens`` when the run ends (at the next tagged
     token, and once more after the last token).

     Returns:
         The same ``structured_document`` instance that was passed in.
     """
     active_tag_value: Optional[str] = None
     extended_tokens: List[Any] = []
     skipped_tag_values = set()
     for token in iter_all_tokens_excluding_space(structured_document):
         tag_value = structured_document.get_tag_or_preserved_tag_value(token)
         if not tag_value:
             # untagged token: only touched while an enabled tag is active
             if active_tag_value:
                 structured_document.set_tag_only(
                     token,
                     add_tag_prefix(active_tag_value, I_TAG_PREFIX)
                 )
                 extended_tokens.append(token)
             continue
         # tagged token: close the previous run before starting a new one
         _log_previous_included_tokens(
             active_tag_value, extended_tokens
         )
         extended_tokens.clear()
         if tag_value in self.config.enabled_tags:
             active_tag_value = tag_value
         else:
             active_tag_value = None
             skipped_tag_values.add(tag_value)
     # report the run that was still open when the tokens ran out
     _log_previous_included_tokens(
         active_tag_value, extended_tokens
     )
     LOGGER.debug('ignore not enabled tag values: %s', skipped_tag_values)
     return structured_document
Ejemplo n.º 3
0
 def annotate(self,
              structured_document: GrobidTrainingTeiStructuredDocument):
     """Assign a group tag to untagged tokens that follow a tagged token.

     There is currently no support for more than two levels of tagging,
     which would allow a parent level to be represented. As a workaround,
     a separate parent (group) tag is added to the tokens without a tag,
     so that those tokens share a common parent element in the output.

     The group tag is looked up via ``self.config.get_group_tag_for_tag_fn``
     for every tagged token; subsequent untagged tokens receive the
     ``I_TAG_PREFIX`` variant of that group tag at ``self.config.tag_level``.
     Tags for which no group tag is returned are collected and only logged.

     Returns:
         The same ``structured_document`` instance that was passed in.
     """
     level = self.config.tag_level
     tags_without_group = set()
     group_tag = None
     for token in iter_all_tokens_excluding_space(structured_document):
         # When a tag level is configured (i.e. we are looking at sub tags),
         # only consider tokens that already have a tag
         # (checked without passing a level).
         if level and not structured_document.get_tag_or_preserved_tag(token):
             continue
         tag = structured_document.get_tag_or_preserved_tag_value(
             token, level=level
         )
         if tag:
             # a tagged token (re-)defines the group for the tokens after it
             group_tag = self.config.get_group_tag_for_tag_fn(tag)
             if not group_tag:
                 tags_without_group.add(tag)
         elif group_tag:
             prefixed_group_tag = add_tag_prefix(
                 group_tag, prefix=I_TAG_PREFIX
             )
             structured_document.set_tag_only(
                 token, prefixed_group_tag, level=level
             )
             LOGGER.debug('updated group token (%r): %s', group_tag, token)
     LOGGER.debug('ignored unmatched tags: %s', tags_without_group)
     return structured_document