def _merge_sub_tags(
        structured_document: AbstractStructuredDocument,
        tokens: List[Any],
        config: ReferenceAnnotatorConfig):
    sub_tags = [structured_document.get_sub_tag(token) for token in tokens]
    mapped_sub_tags = _map_tags(sub_tags, config.sub_tag_map)
    transformed_sub_tags = get_extended_line_token_tags(
        mapped_sub_tags,
        extend_to_line_enabled_map={},
        merge_enabled_map={
            key: True
            for key in config.merge_enabled_sub_tags
        },
        default_merge_enabled=False,
        default_extend_to_line_enabled=False
    )
    LOGGER.debug(
        'sub tokens, transformed: %s -> %s -> %s (tokens: %s)',
        sub_tags, mapped_sub_tags, transformed_sub_tags, tokens
    )
    for token, token_sub_tag in zip(tokens, transformed_sub_tags):
        if not token_sub_tag:
            continue
        structured_document.set_sub_tag(token, token_sub_tag)
    return structured_document
def _iter_all_tokens(
        structured_document: AbstractStructuredDocument) -> Iterable[Any]:
    return (
        token
        for page in structured_document.get_pages()
        for line in structured_document.get_lines_of_page(page)
        for token in structured_document.get_tokens_of_line(line)
    )
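# --- Illustrative usage sketch (not part of the original source) ---
# A minimal stub implementing only the accessors that _iter_all_tokens relies on
# (get_pages, get_lines_of_page, get_tokens_of_line), to show the
# page -> line -> token flattening. The stub class and the example data below
# are assumptions for illustration, not the real AbstractStructuredDocument API.
class _StubStructuredDocument:
    def __init__(self, pages):
        # pages: list of pages, each page a list of lines,
        # each line a list of tokens
        self._pages = pages

    def get_pages(self):
        return self._pages

    def get_lines_of_page(self, page):
        return page

    def get_tokens_of_line(self, line):
        return line


# Example (hypothetical):
#   list(_iter_all_tokens(_StubStructuredDocument([[['a', 'b'], ['c']]])))
#   -> ['a', 'b', 'c']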
def _map_etal_sub_tag(
        structured_document: AbstractStructuredDocument,
        tokens: List[Any],
        config: ReferenceAnnotatorConfig):
    sub_tags = [structured_document.get_sub_tag(token) for token in tokens]
    mapped_sub_tags = _map_tags(sub_tags, config.sub_tag_map)
    transformed_sub_tags = get_etal_mapped_tags(
        mapped_sub_tags,
        etal_sub_tag=config.etal_sub_tag,
        etal_merge_enabled_sub_tags=config.etal_merge_enabled_sub_tags
    )
    LOGGER.debug(
        'etal sub tokens, transformed: %s -> %s -> %s (tokens: %s)',
        sub_tags, mapped_sub_tags, transformed_sub_tags, tokens
    )
    for token, token_sub_tag in zip(tokens, transformed_sub_tags):
        if not token_sub_tag:
            continue
        structured_document.set_sub_tag(token, token_sub_tag)
    return structured_document
def from_structured_document(
        structured_document: AbstractStructuredDocument,
        normalize_fn: Callable[[str], str]):
    pending_sequences: List[SequenceWrapper] = []
    for line in iter_lines(structured_document):
        tokens = get_untagged_line_tokens(structured_document, line)
        if tokens:
            LOGGER.debug(
                'tokens without tag: %s',
                [structured_document.get_text(token) for token in tokens])
            pending_sequences.append(
                SequenceWrapperWithPosition(
                    structured_document,
                    tokens,
                    normalize_fn,
                    position=len(pending_sequences)))
    return PendingSequences(pending_sequences)
def __init__(self,
             structured_document: AbstractStructuredDocument,
             tokens: List[T_Token],
             str_filter_f: Optional[Callable[[str], str]] = None):
    self.structured_document = structured_document
    self.str_filter_f = str_filter_f
    self.tokens = tokens
    self.token_str_list = [
        structured_document.get_text(t) or '' for t in tokens
    ]
    if str_filter_f:
        self.token_str_list = [
            str_filter_f(s) for s in self.token_str_list
        ]
    self.joined_text = JoinedText(
        self.token_str_list,
        sep=' ',
        whitespace_list=[get_token_whitespace(t) for t in tokens])
    self.tokens_as_str = str(self.joined_text)
def annotate(self, structured_document: AbstractStructuredDocument):
    assert isinstance(structured_document,
                      GrobidTrainingTeiStructuredDocument)
    LOGGER.debug('preserving tags')
    token_preserved_tags = [
        (token, structured_document.get_tag_or_preserved_tag(token))
        for token in iter_all_tokens_including_space(structured_document)
    ]
    # clear the tags for now, otherwise the tokens would be ignored by the annotator
    for token, _ in token_preserved_tags:
        structured_document.set_tag_only(token, None)
        if not self.config.preserve_sub_annotations:
            structured_document.clear_preserved_sub_tag(token)
    # process auto-annotations
    super().annotate(structured_document)
    # restore the original tags (now combined with the auto-annotated sub-tags)
    for token, preserved_tag in token_preserved_tags:
        LOGGER.debug('restoring preserved tag: %r -> %r', token,
                     preserved_tag)
        structured_document.set_tag_only(token, preserved_tag)
    return structured_document
def _iter_group_tokens_by_tag_entity(
        structured_document: AbstractStructuredDocument,
        tokens: Iterable[T]) -> Iterable[Tuple[Optional[str], List[T]]]:
    pending_tag_value = None
    pending_tokens = None
    for token in tokens:
        current_full_tag = structured_document.get_tag(token)
        current_tag_prefix, current_tag_value = split_tag_prefix(current_full_tag)
        if (
            pending_tokens
            and (
                pending_tag_value != current_tag_value
                or current_tag_prefix == B_TAG_PREFIX
            )
        ):
            yield pending_tag_value, pending_tokens
            pending_tokens = None
        if not pending_tokens:
            pending_tag_value = current_tag_value
            pending_tokens = [token]
            continue
        pending_tokens.append(token)
    if pending_tokens:
        yield pending_tag_value, pending_tokens
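# --- Illustrative grouping sketch (not part of the original source) ---
# Demonstrates how _iter_group_tokens_by_tag_entity groups consecutive tokens into
# entities: a group ends when the tag value changes or when a token carries the
# beginning prefix (B_TAG_PREFIX). This assumes the usual 'b-...'/'i-...' prefix
# convention handled by split_tag_prefix; the stub document and the sample tags are
# assumptions for illustration only.
class _StubTaggedDocument:
    def __init__(self, tag_by_token):
        self._tag_by_token = tag_by_token

    def get_tag(self, token):
        return self._tag_by_token.get(token)


# Example (hypothetical), with two author entities followed by a year entity:
#   doc = _StubTaggedDocument({
#       'Smith': 'b-author', 'J': 'i-author',
#       'Jones': 'b-author', 'A': 'i-author',
#       '2001': 'b-year'
#   })
#   list(_iter_group_tokens_by_tag_entity(doc, ['Smith', 'J', 'Jones', 'A', '2001']))
#   -> [('author', ['Smith', 'J']), ('author', ['Jones', 'A']), ('year', ['2001'])]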
def annotate(self, structured_document: AbstractStructuredDocument):
    assert isinstance(structured_document,
                      GrobidTrainingTeiStructuredDocument)
    structured_document.remove_all_untagged()
    return structured_document
def get_untagged_line_tokens(structured_document: AbstractStructuredDocument,
                             line):
    return [
        token for token in structured_document.get_tokens_of_line(line)
        if not structured_document.get_tag(token)
    ]
def iter_lines(structured_document: AbstractStructuredDocument):
    return (line for page in structured_document.get_pages()
            for line in structured_document.get_lines_of_page(page))
def iter_all_tokens_including_space(
        structured_document: AbstractStructuredDocument) -> Iterable[Any]:
    assert isinstance(structured_document, GrobidTrainingTeiStructuredDocument)
    return (token for page in structured_document.get_pages()
            for line in structured_document.get_lines_of_page(page)
            for token in structured_document.get_all_tokens_of_line(line))
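# --- Illustrative end-to-end sketch (not part of the original source) ---
# Shows how iter_lines and get_untagged_line_tokens combine to collect the tokens
# that still need tagging, mirroring the loop in from_structured_document above.
# The stub document below is an assumption for illustration and implements only
# the accessors those helpers call.
class _StubLineDocument:
    def __init__(self, pages, tag_by_token):
        self._pages = pages
        self._tag_by_token = tag_by_token

    def get_pages(self):
        return self._pages

    def get_lines_of_page(self, page):
        return page

    def get_tokens_of_line(self, line):
        return line

    def get_tag(self, token):
        return self._tag_by_token.get(token)


# Example (hypothetical): only 'Smith' is already tagged, so the remaining
# tokens of the line are reported as untagged.
#   doc = _StubLineDocument(
#       pages=[[['Smith', '2001', 'Nature']]],
#       tag_by_token={'Smith': 'b-author'})
#   [get_untagged_line_tokens(doc, line) for line in iter_lines(doc)]
#   -> [['2001', 'Nature']]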