def _merge_sub_tags(
        structured_document: AbstractStructuredDocument,
        tokens: List[Any],
        config: ReferenceAnnotatorConfig):
    sub_tags = [structured_document.get_sub_tag(token) for token in tokens]
    mapped_sub_tags = _map_tags(sub_tags, config.sub_tag_map)
    transformed_sub_tags = get_extended_line_token_tags(
        mapped_sub_tags,
        extend_to_line_enabled_map={},
        merge_enabled_map={
            key: True
            for key in config.merge_enabled_sub_tags
        },
        default_merge_enabled=False,
        default_extend_to_line_enabled=False
    )
    LOGGER.debug(
        'sub tokens, transformed: %s -> %s -> %s (tokens: %s)',
        sub_tags, mapped_sub_tags, transformed_sub_tags, tokens
    )
    for token, token_sub_tag in zip(tokens, transformed_sub_tags):
        if not token_sub_tag:
            continue
        structured_document.set_sub_tag(token, token_sub_tag)
    return structured_document
def _iter_all_tokens(
        structured_document: AbstractStructuredDocument) -> Iterable[Any]:
    return (
        token
        for page in structured_document.get_pages()
        for line in structured_document.get_lines_of_page(page)
        for token in structured_document.get_tokens_of_line(line)
    )
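# --- Illustrative usage sketch (not part of the original source) ---
# A minimal stub implementing only the accessors that _iter_all_tokens relies on
# (get_pages, get_lines_of_page, get_tokens_of_line), to show the
# page -> line -> token flattening. The stub class and the example data below
# are assumptions for illustration, not the real AbstractStructuredDocument API.
class _StubStructuredDocument:
    def __init__(self, pages):
        # pages: list of pages, each page a list of lines,
        # each line a list of tokens
        self._pages = pages

    def get_pages(self):
        return self._pages

    def get_lines_of_page(self, page):
        return page

    def get_tokens_of_line(self, line):
        return line


# Example (hypothetical):
#   list(_iter_all_tokens(_StubStructuredDocument([[['a', 'b'], ['c']]])))
#   -> ['a', 'b', 'c']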
def _map_etal_sub_tag(
        structured_document: AbstractStructuredDocument,
        tokens: List[Any],
        config: ReferenceAnnotatorConfig):
    sub_tags = [structured_document.get_sub_tag(token) for token in tokens]
    mapped_sub_tags = _map_tags(sub_tags, config.sub_tag_map)
    transformed_sub_tags = get_etal_mapped_tags(
        mapped_sub_tags,
        etal_sub_tag=config.etal_sub_tag,
        etal_merge_enabled_sub_tags=config.etal_merge_enabled_sub_tags
    )
    LOGGER.debug(
        'etal sub tokens, transformed: %s -> %s -> %s (tokens: %s)',
        sub_tags, mapped_sub_tags, transformed_sub_tags, tokens
    )
    for token, token_sub_tag in zip(tokens, transformed_sub_tags):
        if not token_sub_tag:
            continue
        structured_document.set_sub_tag(token, token_sub_tag)
    return structured_document
def from_structured_document(
        structured_document: AbstractStructuredDocument,
        normalize_fn: Callable[[str], str]):
    pending_sequences: List[SequenceWrapper] = []
    for line in iter_lines(structured_document):
        tokens = get_untagged_line_tokens(structured_document, line)
        if tokens:
            LOGGER.debug(
                'tokens without tag: %s',
                [structured_document.get_text(token) for token in tokens])
            pending_sequences.append(
                SequenceWrapperWithPosition(
                    structured_document,
                    tokens,
                    normalize_fn,
                    position=len(pending_sequences)))
    return PendingSequences(pending_sequences)
def __init__(self,
             structured_document: AbstractStructuredDocument,
             tokens: List[T_Token],
             str_filter_f: Optional[Callable[[str], str]] = None):
    self.structured_document = structured_document
    self.str_filter_f = str_filter_f
    self.tokens = tokens
    self.token_str_list = [
        structured_document.get_text(t) or '' for t in tokens
    ]
    if str_filter_f:
        self.token_str_list = [
            str_filter_f(s) for s in self.token_str_list
        ]
    self.joined_text = JoinedText(
        self.token_str_list,
        sep=' ',
        whitespace_list=[get_token_whitespace(t) for t in tokens])
    self.tokens_as_str = str(self.joined_text)
def annotate(self, structured_document: AbstractStructuredDocument):
    assert isinstance(structured_document,
                      GrobidTrainingTeiStructuredDocument)
    LOGGER.debug('preserving tags')
    token_preserved_tags = [
        (token, structured_document.get_tag_or_preserved_tag(token))
        for token in iter_all_tokens_including_space(structured_document)
    ]
    # clear the tags for now, otherwise the tokens would be ignored by the annotator
    for token, _ in token_preserved_tags:
        structured_document.set_tag_only(token, None)
        if not self.config.preserve_sub_annotations:
            structured_document.clear_preserved_sub_tag(token)
    # process auto-annotations
    super().annotate(structured_document)
    # restore the original tags (now combined with the auto-annotated sub-tags)
    for token, preserved_tag in token_preserved_tags:
        LOGGER.debug('restoring preserved tag: %r -> %r', token,
                     preserved_tag)
        structured_document.set_tag_only(token, preserved_tag)
    return structured_document
def _iter_group_tokens_by_tag_entity(
        structured_document: AbstractStructuredDocument,
        tokens: Iterable[T]) -> Iterable[Tuple[Optional[str], List[T]]]:
    pending_tag_value = None
    pending_tokens = None
    for token in tokens:
        current_full_tag = structured_document.get_tag(token)
        current_tag_prefix, current_tag_value = split_tag_prefix(current_full_tag)
        if (
            pending_tokens
            and (
                pending_tag_value != current_tag_value
                or current_tag_prefix == B_TAG_PREFIX
            )
        ):
            yield pending_tag_value, pending_tokens
            pending_tokens = None
        if not pending_tokens:
            pending_tag_value = current_tag_value
            pending_tokens = [token]
            continue
        pending_tokens.append(token)
    if pending_tokens:
        yield pending_tag_value, pending_tokens
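# --- Illustrative grouping sketch (not part of the original source) ---
# Demonstrates how _iter_group_tokens_by_tag_entity groups consecutive tokens into
# entities: a group ends when the tag value changes or when a token carries the
# beginning prefix (B_TAG_PREFIX). This assumes the usual 'b-...'/'i-...' prefix
# convention handled by split_tag_prefix; the stub document and the sample tags are
# assumptions for illustration only.
class _StubTaggedDocument:
    def __init__(self, tag_by_token):
        self._tag_by_token = tag_by_token

    def get_tag(self, token):
        return self._tag_by_token.get(token)


# Example (hypothetical), with two author entities followed by a year entity:
#   doc = _StubTaggedDocument({
#       'Smith': 'b-author', 'J': 'i-author',
#       'Jones': 'b-author', 'A': 'i-author',
#       '2001': 'b-year'
#   })
#   list(_iter_group_tokens_by_tag_entity(doc, ['Smith', 'J', 'Jones', 'A', '2001']))
#   -> [('author', ['Smith', 'J']), ('author', ['Jones', 'A']), ('year', ['2001'])]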
def annotate(self, structured_document: AbstractStructuredDocument):
    assert isinstance(structured_document,
                      GrobidTrainingTeiStructuredDocument)
    structured_document.remove_all_untagged()
    return structured_document
def get_untagged_line_tokens(structured_document: AbstractStructuredDocument,
                             line):
    return [
        token for token in structured_document.get_tokens_of_line(line)
        if not structured_document.get_tag(token)
    ]
def iter_lines(structured_document: AbstractStructuredDocument):
    return (line for page in structured_document.get_pages()
            for line in structured_document.get_lines_of_page(page))
def iter_all_tokens_including_space(
        structured_document: AbstractStructuredDocument) -> Iterable[Any]:
    assert isinstance(structured_document, GrobidTrainingTeiStructuredDocument)
    return (token for page in structured_document.get_pages()
            for line in structured_document.get_lines_of_page(page)
            for token in structured_document.get_all_tokens_of_line(line))
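# --- Illustrative end-to-end sketch (not part of the original source) ---
# Shows how iter_lines and get_untagged_line_tokens combine to collect the tokens
# that still need tagging, mirroring the loop in from_structured_document above.
# The stub document below is an assumption for illustration and implements only
# the accessors those helpers call.
class _StubLineDocument:
    def __init__(self, pages, tag_by_token):
        self._pages = pages
        self._tag_by_token = tag_by_token

    def get_pages(self):
        return self._pages

    def get_lines_of_page(self, page):
        return page

    def get_tokens_of_line(self, line):
        return line

    def get_tag(self, token):
        return self._tag_by_token.get(token)


# Example (hypothetical): only 'Smith' is already tagged, so the remaining
# tokens of the line are reported as untagged.
#   doc = _StubLineDocument(
#       pages=[[['Smith', '2001', 'Nature']]],
#       tag_by_token={'Smith': 'b-author'})
#   [get_untagged_line_tokens(doc, line) for line in iter_lines(doc)]
#   -> [['2001', 'Nature']]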