Example #1
0
def iter_find_line_number_tokens(
        structured_document: GrobidTrainingTeiStructuredDocument, **kwargs):
    """Yield line number tokens found on every page of the document.

    Pages are visited in the order returned by
    ``structured_document.get_pages()``. For each page, its lines are handed
    to ``iter_find_line_number_tokens_in_lines`` and whatever that iterable
    produces is yielded unchanged.

    Args:
        structured_document: the document whose pages are scanned.
        **kwargs: forwarded as-is to
            ``iter_find_line_number_tokens_in_lines``.
    """
    for page in structured_document.get_pages():
        page_lines = structured_document.get_lines_of_page(page)
        yield from iter_find_line_number_tokens_in_lines(
            structured_document, lines=page_lines, **kwargs)
Example #2
0
def _get_document_tagged_token_lines(
        doc: GrobidTrainingTeiStructuredDocument
) -> List[List[Tuple[str, str]]]:
    """Collect ``(tag, text)`` pairs for every token, grouped by line.

    Lines are gathered across all pages of ``doc`` into one flat list, in
    page order and then line order. Each entry is the list of
    ``(doc.get_tag(token), doc.get_text(token))`` pairs for that line's
    tokens. The result is also written to the debug log.
    """
    tagged_lines = []
    for page in doc.get_pages():
        for line in doc.get_lines_of_page(page):
            tagged_lines.append([
                (doc.get_tag(token), doc.get_text(token))
                for token in doc.get_tokens_of_line(line)
            ])
    LOGGER.debug('document_tagged_token_lines: %s', tagged_lines)
    return tagged_lines
Example #3
0
def _simple_document_with_tagged_token_lines(
        lines: List[List[Tuple[str,
                               str]]]) -> GrobidTrainingTeiStructuredDocument:
    """Build a structured document from lines of ``(tag, token)`` pairs.

    Each input line contributes its token texts joined by single spaces,
    followed by an ``E.lb()`` element. The resulting items are wrapped with
    ``_tei`` and parsed into a ``GrobidTrainingTeiStructuredDocument``.
    The document's tokens are then paired positionally with the input
    tokens: the text of each pair is asserted to match, and every truthy
    tag is applied to the corresponding document token.

    Args:
        lines: one list per line, each holding ``(tag, token_text)`` pairs;
            a falsy tag leaves the token untagged.

    Returns:
        The document with tags applied.
    """
    tei_items = []
    for tagged_tokens in lines:
        line_text = ' '.join(text for _, text in tagged_tokens)
        tei_items.extend([line_text, E.lb()])

    doc = GrobidTrainingTeiStructuredDocument(
        _tei(tei_items), container_node_path=DEFAULT_CONTAINER_NODE_PATH)

    # Flatten the lines of all pages so they can be paired with the input.
    doc_lines = []
    for page in doc.get_pages():
        doc_lines.extend(doc.get_lines_of_page(page))

    for tagged_tokens, doc_line in zip(lines, doc_lines):
        doc_tokens = doc.get_tokens_of_line(doc_line)
        for (tag, text), doc_token in zip(tagged_tokens, doc_tokens):
            # Check that the parsed token is the one we meant to tag.
            assert text == doc.get_text(doc_token)
            if tag:
                doc.set_tag(doc_token, tag)
    return doc