# Example #1
def get_line_number_candidates(
        structured_document: GrobidTrainingTeiStructuredDocument, tokens: list,
        min_line_number: int, max_line_number_gap: int):
    """Return the tokens that look like printed line numbers.

    Each token whose text parses as a line number becomes a candidate
    ``(token, parsed_number, 1-based position)``. Candidates are then grouped
    into ascending sequences (allowing gaps of up to ``max_line_number_gap``),
    and only tokens belonging to sequences of at least ``min_line_number``
    entries are returned. Candidates that move backwards in the document or
    repeat the previous number are logged and skipped.
    """
    candidates = []
    for position, token in enumerate(tokens, start=1):
        token_text = structured_document.get_text(token)
        if is_line_number(token_text):
            candidates.append((token, parse_line_number(token_text), position))
    if not candidates:
        return []
    # Order by parsed number first, document position second.
    candidates.sort(key=lambda entry: (entry[1], entry[2]))
    sequences = [[candidates[0]]]
    for candidate in candidates[1:]:
        _, num, position = candidate
        previous = sequences[-1][-1]
        _, previous_num, previous_position = previous
        next_expected = previous_num + 1
        if position < previous_position or num == previous_num:
            # Out-of-order or duplicate numbers cannot extend any sequence.
            LOGGER.debug('ignoring out of sequence: %s (prev: %s)', candidate,
                         previous)
        elif next_expected <= num <= next_expected + max_line_number_gap:
            sequences[-1].append(candidate)
        else:
            # Too large a jump: start a fresh sequence.
            sequences.append([candidate])
    return [
        token
        for sequence in sequences
        if len(sequence) >= min_line_number
        for token, _, _ in sequence
    ]
# Example #2
def _get_document_tagged_token_lines(
        doc: GrobidTrainingTeiStructuredDocument
) -> List[List[Tuple[str, str]]]:
    """Collect ``(tag, text)`` pairs for every token, grouped per line.

    Iterates all pages of *doc* in order, and within each page all lines,
    producing one inner list per line.
    """
    tagged_token_lines = []
    for page in doc.get_pages():
        for line in doc.get_lines_of_page(page):
            tagged_token_lines.append([
                (doc.get_tag(token), doc.get_text(token))
                for token in doc.get_tokens_of_line(line)
            ])
    LOGGER.debug('document_tagged_token_lines: %s',
                 tagged_token_lines)
    return tagged_token_lines
# Example #3
def _simple_document_with_tagged_token_lines(
        lines: List[List[Tuple[str,
                               str]]]) -> GrobidTrainingTeiStructuredDocument:
    """Build a structured TEI document from ``(tag, token)`` line specs.

    Each inner list describes one line; its token texts are joined with
    spaces and separated by ``<lb/>`` elements. After construction, each
    non-empty tag from the spec is applied to the matching document token.
    Test fixture helper: asserts that the round-tripped token text matches
    the spec.
    """
    tei_items = []
    for tagged_line in lines:
        tei_items.append(' '.join(text for _, text in tagged_line))
        tei_items.append(E.lb())
    doc = GrobidTrainingTeiStructuredDocument(
        _tei(tei_items), container_node_path=DEFAULT_CONTAINER_NODE_PATH)
    doc_lines = [
        doc_line
        for page in doc.get_pages()
        for doc_line in doc.get_lines_of_page(page)
    ]
    for tagged_line, doc_line in zip(lines, doc_lines):
        doc_tokens = doc.get_tokens_of_line(doc_line)
        for (tag, text), doc_token in zip(tagged_line, doc_tokens):
            assert text == doc.get_text(doc_token)
            if tag:
                doc.set_tag(doc_token, tag)
    return doc
def _add_name_text_suffix(
        structured_document: GrobidTrainingTeiStructuredDocument,
        tokens: List[Any],
        config: ReferenceAnnotatorConfig):
    """Extend name sub-tags over suffix tokens and write them back.

    Gathers each token's sub-tag, text and trailing whitespace, maps the
    sub-tags via ``config.sub_tag_map``, extends them across suffixes for
    the tags enabled in ``config.include_suffix_enabled_sub_tags``, then
    applies every non-empty resulting sub-tag to its token.
    Returns *structured_document* (mutated in place).
    """
    sub_tags = []
    token_texts = []
    token_whitespaces = []
    for token in tokens:
        sub_tags.append(structured_document.get_sub_tag(token))
        token_texts.append(structured_document.get_text(token))
        token_whitespaces.append(structured_document.get_whitespace(token))
    mapped_sub_tags = _map_tags(sub_tags, config.sub_tag_map)
    transformed_sub_tags = get_suffix_extended_token_tags(
        mapped_sub_tags,
        token_texts,
        token_whitespaces=token_whitespaces,
        enabled_tags=config.include_suffix_enabled_sub_tags
    )
    LOGGER.debug(
        'name suffix sub tokens, transformed: %s -> %s -> %s (tokens: %s)',
        sub_tags, mapped_sub_tags, transformed_sub_tags, tokens
    )
    for token, new_sub_tag in zip(tokens, transformed_sub_tags):
        if new_sub_tag:
            structured_document.set_sub_tag(token, new_sub_tag)
    return structured_document