def get_line_number_candidates(
        structured_document: GrobidTrainingTeiStructuredDocument,
        tokens: list,
        min_line_number: int,
        max_line_number_gap: int):
    """
    Return the tokens that look like printed line numbers forming plausible
    ascending sequences.

    Tokens whose text parses as a line number are grouped into runs of
    (mostly) consecutive numbers; a run may skip up to ``max_line_number_gap``
    numbers between entries. Only runs with at least ``min_line_number``
    entries are accepted, and their tokens are returned in run order.
    """
    # Collect (token, parsed number, 1-based position) for every token whose
    # text is recognised as a line number.
    candidates = []
    for position, token in enumerate(tokens, start=1):
        token_text = structured_document.get_text(token)
        if is_line_number(token_text):
            candidates.append((token, parse_line_number(token_text), position))
    if not candidates:
        return []
    # Order primarily by the parsed number, secondarily by token position,
    # so that sequence growth below sees numbers in ascending order.
    candidates.sort(key=lambda entry: (entry[1], entry[2]))
    # Greedily grow the most recent sequence; start a new one on a large jump.
    sequences = [[candidates[0]]]
    for entry in candidates[1:]:
        _token, number, position = entry
        current_sequence = sequences[-1]
        _, last_number, last_position = current_sequence[-1]
        next_expected = last_number + 1
        if position < last_position or number == last_number:
            # Token appears before the previous one in the document, or
            # repeats the same number: cannot belong to this sequence.
            LOGGER.debug('ignoring out of sequence: %s (prev: %s)', entry, current_sequence[-1])
        elif next_expected <= number <= next_expected + max_line_number_gap:
            current_sequence.append(entry)
        else:
            sequences.append([entry])
    # Flatten the tokens of every sufficiently long sequence.
    return [
        token
        for sequence in sequences
        if len(sequence) >= min_line_number
        for token, _, _ in sequence
    ]
def _get_document_tagged_token_lines(
        doc: GrobidTrainingTeiStructuredDocument
) -> List[List[Tuple[str, str]]]:
    """Return, for every line of every page, the list of (tag, text) pairs of its tokens."""
    result: List[List[Tuple[str, str]]] = []
    for page in doc.get_pages():
        for line in doc.get_lines_of_page(page):
            result.append([
                (doc.get_tag(token), doc.get_text(token))
                for token in doc.get_tokens_of_line(line)
            ])
    LOGGER.debug('document_tagged_token_lines: %s', result)
    return result
def _simple_document_with_tagged_token_lines(
        lines: List[List[Tuple[str, str]]]) -> GrobidTrainingTeiStructuredDocument:
    """
    Build a structured TEI document from (tag, text) token lines and apply
    the given tags to the corresponding document tokens.
    """
    # Each input line becomes its text joined by spaces, followed by a <lb/>.
    tei_items = []
    for tagged_tokens in lines:
        tei_items.append(' '.join(text for _tag, text in tagged_tokens))
        tei_items.append(E.lb())
    doc = GrobidTrainingTeiStructuredDocument(
        _tei(tei_items),
        container_node_path=DEFAULT_CONTAINER_NODE_PATH)
    doc_lines = [
        doc_line
        for page in doc.get_pages()
        for doc_line in doc.get_lines_of_page(page)
    ]
    # Walk the parsed document lines in step with the input and tag tokens.
    for tagged_tokens, doc_line in zip(lines, doc_lines):
        doc_tokens = doc.get_tokens_of_line(doc_line)
        for (tag, text), doc_token in zip(tagged_tokens, doc_tokens):
            # Sanity check: parsed token text must match the input token.
            assert text == doc.get_text(doc_token)
            if tag:
                doc.set_tag(doc_token, tag)
    return doc
def _add_name_text_suffix(
        structured_document: GrobidTrainingTeiStructuredDocument,
        tokens: List[Any],
        config: ReferenceAnnotatorConfig):
    """
    Extend the sub-tags of ``tokens`` so that suffix text is included for the
    enabled sub-tags, writing the resulting sub-tags back to the document.

    Returns the (mutated) ``structured_document`` for convenience.
    """
    # Gather the current sub-tag, text and trailing whitespace per token.
    sub_tags = []
    texts = []
    whitespaces = []
    for token in tokens:
        sub_tags.append(structured_document.get_sub_tag(token))
        texts.append(structured_document.get_text(token))
        whitespaces.append(structured_document.get_whitespace(token))
    mapped_sub_tags = _map_tags(sub_tags, config.sub_tag_map)
    extended_sub_tags = get_suffix_extended_token_tags(
        mapped_sub_tags,
        texts,
        token_whitespaces=whitespaces,
        enabled_tags=config.include_suffix_enabled_sub_tags
    )
    LOGGER.debug(
        'name suffix sub tokens, transformed: %s -> %s -> %s (tokens: %s)',
        sub_tags, mapped_sub_tags, extended_sub_tags, tokens
    )
    # Only write back non-empty sub-tags; empty values leave tokens untouched.
    for token, new_sub_tag in zip(tokens, extended_sub_tags):
        if new_sub_tag:
            structured_document.set_sub_tag(token, new_sub_tag)
    return structured_document