def iter_find_line_number_tokens(
        structured_document: GrobidTrainingTeiStructuredDocument,
        **kwargs):
    """Yield the tokens found on every page of the document, page by page.

    For each page returned by ``structured_document.get_pages()``, the lines
    of that page are handed to ``iter_find_line_number_tokens_in_lines`` and
    everything that call produces is yielded in order.

    Args:
        structured_document: the document whose pages are scanned.
        **kwargs: forwarded unchanged to
            ``iter_find_line_number_tokens_in_lines``.
    """
    for current_page in structured_document.get_pages():
        page_lines = structured_document.get_lines_of_page(current_page)
        yield from iter_find_line_number_tokens_in_lines(
            structured_document,
            lines=page_lines,
            **kwargs
        )
def _get_document_tagged_token_lines(
        doc: GrobidTrainingTeiStructuredDocument
) -> List[List[Tuple[str, str]]]:
    """Collect ``(tag, text)`` pairs for every token, grouped by line.

    Lines are visited page by page, in the order the document returns them.
    The result is logged at debug level before being returned.

    Returns:
        One list per line, each holding a ``(doc.get_tag(token),
        doc.get_text(token))`` tuple for every token of that line.
    """
    tagged_lines = []
    for page in doc.get_pages():
        for line in doc.get_lines_of_page(page):
            tagged_lines.append([
                (doc.get_tag(token), doc.get_text(token))
                for token in doc.get_tokens_of_line(line)
            ])
    LOGGER.debug('document_tagged_token_lines: %s', tagged_lines)
    return tagged_lines
def _simple_document_with_tagged_token_lines(
        lines: List[List[Tuple[str, str]]]
) -> GrobidTrainingTeiStructuredDocument:
    """Build a structured document from lines of ``(tag, token)`` pairs.

    Each input line becomes one text item (its tokens joined by a single
    space) followed by an ``E.lb()`` element. The items are wrapped with
    ``_tei`` and loaded into a ``GrobidTrainingTeiStructuredDocument``.
    The document's tokens are then paired with the input tokens, the text of
    each pair is asserted to match, and every truthy tag is set on its token.

    Args:
        lines: one list per line, each holding ``(tag, token_text)`` tuples;
            a falsy tag leaves the token untagged.

    Returns:
        The populated document.
    """
    tei_items = []
    for tagged_tokens in lines:
        line_text = ' '.join(text for _, text in tagged_tokens)
        tei_items.extend([line_text, E.lb()])

    doc = GrobidTrainingTeiStructuredDocument(
        _tei(tei_items),
        container_node_path=DEFAULT_CONTAINER_NODE_PATH
    )

    # Flatten the document's lines across all pages, in page order.
    doc_lines = []
    for page in doc.get_pages():
        doc_lines.extend(doc.get_lines_of_page(page))

    for tagged_tokens, doc_line in zip(lines, doc_lines):
        doc_tokens = doc.get_tokens_of_line(doc_line)
        for (tag, expected_text), doc_token in zip(tagged_tokens, doc_tokens):
            # Sanity check: the document tokenised the text as expected.
            assert expected_text == doc.get_text(doc_token)
            if not tag:
                continue
            doc.set_tag(doc_token, tag)
    return doc
def test_should_return_root_as_pages(self):
    """The document should expose the root element as its only page."""
    tei_root = _tei(front_items=[])
    pages = list(GrobidTrainingTeiStructuredDocument(tei_root).get_pages())
    assert pages == [tei_root]