Esempio n. 1
0
def convert_two_letter_uppercase_given_name_to_given_middle_name(
        name: T_SemanticName):
    """Split a two-letter uppercase given name into given and middle name.

    The name is modified in place, and only when all of the following hold:
    it has no middle name yet, it has exactly one given name, and the text of
    that given name is exactly two uppercase characters. In that case the
    given name is retokenized into individual characters; the first token
    becomes the new given name and every following token a middle name.
    Otherwise the function logs the reason and returns without changes.
    """
    given_name_items = list(name.iter_by_type(SemanticGivenName))
    existing_middle_names = list(name.iter_by_type(SemanticMiddleName))
    if existing_middle_names:
        LOGGER.debug('already has a middle name: %r', existing_middle_names)
        return
    if len(given_name_items) != 1:
        LOGGER.debug('no or too many given names: %r', given_name_items)
        return
    only_given_name = given_name_items[0]
    initials_text = only_given_name.get_text()
    if not (len(initials_text) == 2 and initials_text.isupper()):
        LOGGER.debug('not two uppercase characters: %r', initials_text)
        return
    given_name_blocks = list(only_given_name.iter_blocks())
    per_character_document = LayoutDocument.for_blocks(
        given_name_blocks
    ).retokenize(tokenize_fn=tokenize_individual_characters)
    LOGGER.debug('retokenized_layout_document: %r', per_character_document)
    split_name_parts = []
    for position, character_token in enumerate(
            per_character_document.iter_all_tokens()):
        # The first character stays the given name, the rest are middle names.
        part_class = SemanticGivenName if position == 0 else SemanticMiddleName
        split_name_parts.append(
            part_class(layout_block=LayoutBlock.for_tokens([character_token])))
    LOGGER.debug('split_name_parts: %r', split_name_parts)

    def replace_with_split_parts(_):
        return split_name_parts

    name.flat_map_inplace_by_type(SemanticGivenName, replace_with_split_parts)
 def test_should_retokenize_document_with_placeholders(self):
     """Check that a token containing a space is split into two tokens.

     The single source token spans both words; after retokenization the
     whitespace and the coordinates are expected to be distributed over the
     resulting tokens in proportion to their share of the original text.
     """
     text = 'token1 token2'
     source_coordinates = LayoutPageCoordinates(
         x=10, y=10, width=100, height=50)
     source_token = LayoutToken(
         text, whitespace='\n', coordinates=source_coordinates)
     source_page = LayoutPage(
         blocks=[LayoutBlock.for_tokens([source_token])],
         graphics=[])
     layout_document = LayoutDocument(pages=[source_page])

     retokenized_layout_document = retokenize_layout_document(
         layout_document)

     line = retokenized_layout_document.pages[0].blocks[0].lines[0]
     assert [t.text for t in line.tokens] == ['token1', 'token2']
     assert [t.whitespace for t in line.tokens] == [' ', '\n']
     first_coordinates = line.tokens[0].coordinates
     assert first_coordinates.x == 10.0
     assert first_coordinates.width == 100 * len('token1') / len(text)
     second_coordinates = line.tokens[1].coordinates
     assert second_coordinates.x == 10.0 + 100 * len('token1 ') / len(text)
     assert second_coordinates.width == 100 * len('token2') / len(text)
Esempio n. 3
0
def iter_entity_layout_blocks_for_labeled_layout_tokens(
    labeled_layout_tokens: Iterable[LabeledLayoutToken]
) -> Iterable[Tuple[str, LayoutBlock]]:
    """Yield one ``(tag, LayoutBlock)`` pair per entity in the labeled tokens.

    The labels of the tokens are passed to ``get_entities_including_other``,
    and for every ``(tag, start, end)`` it reports, the layout tokens from
    ``start`` to ``end`` (inclusive) are wrapped in a ``LayoutBlock``.

    Args:
        labeled_layout_tokens: The labeled tokens; any iterable is accepted,
            including one-shot iterators such as generators.

    Yields:
        Tuples of the entity tag and the layout block holding its tokens.
    """
    # Materialize once: the input is read twice below (tokens and labels), and
    # a one-shot iterator would be exhausted after the first pass, silently
    # leaving the labels empty.
    labeled_layout_tokens = list(labeled_layout_tokens)
    layout_tokens = [result.layout_token for result in labeled_layout_tokens]
    labels = [result.label for result in labeled_layout_tokens]
    LOGGER.debug('layout_tokens: %s', layout_tokens)
    LOGGER.debug('labels: %s', labels)
    for tag, start, end in get_entities_including_other(labels):
        # ``end`` is inclusive, hence the ``+ 1`` for the slice.
        yield tag, LayoutBlock.for_tokens(layout_tokens[start:end + 1])
 def test_should_remove_blank_token(self):
     """A token consisting of a single space should be dropped entirely."""
     blank_block = LayoutBlock.for_tokens([LayoutToken(' ')])
     blank_page = LayoutPage(blocks=[blank_block], graphics=[])
     layout_document = LayoutDocument(pages=[blank_page])

     retokenized_layout_document = retokenize_layout_document(
         layout_document)

     first_line = retokenized_layout_document.pages[0].blocks[0].lines[0]
     assert first_line.tokens == []
 def test_should_not_retokenize_document_with_valid_tokens(self):
     """A token without whitespace should come out with the same text."""
     valid_block = LayoutBlock.for_tokens([LayoutToken('token1')])
     valid_page = LayoutPage(blocks=[valid_block], graphics=[])
     layout_document = LayoutDocument(pages=[valid_page])

     retokenized_layout_document = retokenize_layout_document(
         layout_document)

     first_line = retokenized_layout_document.pages[0].blocks[0].lines[0]
     token_texts = [token.text for token in first_line.tokens]
     assert token_texts == ['token1']
Esempio n. 6
0
 def test_should_add_superscript_text(self):
     """The token with the superscript font should end up in a `hi` element
     with `rend="superscript"`, without changing the overall text."""
     tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
         LayoutToken(TOKEN_3),
     ]
     block = LayoutBlock.for_tokens(tokens)
     tei_children = iter_layout_block_tei_children(block)
     node = TEI_E.node(*tei_children)

     superscript_texts = get_tei_xpath_text_content_list(
         node, './tei:hi[@rend="superscript"]')
     assert superscript_texts == [TOKEN_2]
     full_text = get_text_content(node)
     assert full_text == ' '.join((TOKEN_1, TOKEN_2, TOKEN_3))
Esempio n. 7
0
def get_section_label_and_title_from_layout_block(
    layout_block: LayoutBlock
) -> Tuple[Optional[LayoutBlock], LayoutBlock]:
    """Separate a section header block into its label and its title.

    The block's text is matched (case-insensitively, anchored at the start)
    against ``HEADER_LABEL_REGEX``. Group 1 of the match delimits the end of
    the label and group 2 marks the start of the title.

    Returns:
        A ``(label_block, title_block)`` tuple. When the block is falsy or the
        text does not match, the result is ``(None, layout_block)``, i.e. the
        input block is returned unchanged as the title.
    """
    if not layout_block:
        return None, layout_block
    tokens_text = LayoutTokensText(layout_block)
    full_text = str(tokens_text)
    header_match = re.match(HEADER_LABEL_REGEX, full_text, re.IGNORECASE)
    if not header_match:
        return None, layout_block

    def block_for_text_range(range_start: int, range_end: int) -> LayoutBlock:
        # Collect the tokens for the given character range of the text.
        return LayoutBlock.for_tokens(list(
            tokens_text.iter_layout_tokens_between(range_start, range_end)
        ))

    label_end = header_match.end(1)
    title_start = header_match.start(2)
    LOGGER.debug(
        'label_end: %d, title_start: %d (text: %r)',
        label_end, title_start, full_text
    )
    label_block = block_for_text_range(0, label_end)
    title_block = block_for_text_range(title_start, len(full_text))
    return label_block, title_block
 def test_should_preserve_meta(self):
     """The page meta should be carried over to the retokenized page."""
     page_meta = LayoutPageMeta(COORDINATES_1)
     source_block = LayoutBlock.for_tokens([LayoutToken('token1 token2')])
     source_page = LayoutPage(
         blocks=[source_block], graphics=[], meta=page_meta)
     layout_document = LayoutDocument(pages=[source_page])

     retokenized_layout_document = retokenize_layout_document(
         layout_document)

     retokenized_page = retokenized_layout_document.pages[0]
     assert retokenized_page.meta == page_meta
Esempio n. 9
0
 def test_should_add_bold_and_italics_text(self):
     """The token with the bold-italics font should be found under both a
     bold and an italic `hi` element, without changing the overall text."""
     tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
         LayoutToken(TOKEN_3),
     ]
     block = LayoutBlock.for_tokens(tokens)
     tei_children = iter_layout_block_tei_children(block)
     node = TEI_E.node(*tei_children)
     LOGGER.debug('xml: %r', etree.tostring(node))

     bold_texts = get_tei_xpath_text_content_list(
         node, './/tei:hi[@rend="bold"]')
     assert bold_texts == [TOKEN_2]
     italic_texts = get_tei_xpath_text_content_list(
         node, './/tei:hi[@rend="italic"]')
     assert italic_texts == [TOKEN_2]
     full_text = get_text_content(node)
     assert full_text == ' '.join((TOKEN_1, TOKEN_2, TOKEN_3))
Esempio n. 10
0
def iter_semantic_markers_for_layout_block(
    layout_block: LayoutBlock
) -> Iterable[Union[SemanticMarker, SemanticContentWrapper]]:
    """Break the text of a block into markers and marker delimiters.

    The block text is split on every non-digit character, keeping those
    characters as separate pieces. Empty pieces are ignored. A piece that is
    a comma or whitespace is yielded as a ``SemanticNote`` with the note type
    ``'marker_delimiter'``; every other piece is yielded as a
    ``SemanticMarker``. Each piece gets a fresh single-token layout block
    without trailing whitespace.
    """
    pieces = re.split(r'(\D)', layout_block.text)
    for piece in filter(None, pieces):
        piece_block = LayoutBlock.for_tokens(
            [LayoutToken(piece, whitespace='')])
        is_delimiter = piece == ',' or piece.isspace()
        if is_delimiter:
            yield SemanticNote(layout_block=piece_block,
                               note_type='marker_delimiter')
        else:
            yield SemanticMarker(layout_block=piece_block)
Esempio n. 11
0
def get_cleaned_abstract_layout_block(layout_block: LayoutBlock) -> LayoutBlock:
    """Return the abstract block starting at group 1 of ``ABSTRACT_REGEX``.

    The block's text is matched (case-insensitively, anchored at the start)
    against ``ABSTRACT_REGEX``. On a match, a new block is built from the
    tokens between the start of group 1 and the end of the text. A falsy
    block, a block without lines, or a non-matching text is returned as is.
    """
    if not (layout_block and layout_block.lines):
        return layout_block
    tokens_text = LayoutTokensText(layout_block)
    full_text = str(tokens_text)
    abstract_match = re.match(ABSTRACT_REGEX, full_text, re.IGNORECASE)
    if not abstract_match:
        LOGGER.debug('text does not match regex: %r', full_text)
        return layout_block
    content_start = abstract_match.start(1)
    LOGGER.debug('start: %d (text: %r)', content_start, full_text)
    remaining_tokens = list(
        tokens_text.iter_layout_tokens_between(content_start, len(full_text))
    )
    return LayoutBlock.for_tokens(remaining_tokens)
 def test_should_create_lines_based_on_line_descriptor(self):
     """Tokens with different line descriptors should end up in separate
     lines, preserving their order."""
     first_descriptor = LayoutLineDescriptor(line_id=1)
     second_descriptor = LayoutLineDescriptor(line_id=2)
     first_line_tokens = [
         LayoutToken('token1.1', line_descriptor=first_descriptor),
         LayoutToken('token1.2', line_descriptor=first_descriptor),
     ]
     second_line_tokens = [
         LayoutToken('token2.1', line_descriptor=second_descriptor),
         LayoutToken('token2.2', line_descriptor=second_descriptor),
     ]
     all_tokens = [*first_line_tokens, *second_line_tokens]

     layout_block = LayoutBlock.for_tokens(all_tokens)

     assert len(layout_block.lines) == 2
     assert layout_block.lines[0].tokens == first_line_tokens
     assert layout_block.lines[1].tokens == second_line_tokens
Esempio n. 13
0
def get_regex_cleaned_layout_block_with_prefix_suffix(
    layout_block: LayoutBlock, regex_pattern: Optional[str]
) -> Tuple[LayoutBlock, LayoutBlock, LayoutBlock]:
    """Split a block into prefix, matched content and suffix using a regex.

    The block's text is matched (case-insensitively, anchored at the start)
    against ``regex_pattern``. Group 1 of the match defines the content; the
    tokens before it form the prefix and the tokens after it the suffix.

    Returns:
        A ``(prefix, content, suffix)`` tuple of layout blocks. When the block
        is falsy, has no lines, no pattern is given, or the text does not
        match, the result is ``(EMPTY_BLOCK, layout_block, EMPTY_BLOCK)``.
    """
    if not (layout_block and layout_block.lines and regex_pattern):
        return EMPTY_BLOCK, layout_block, EMPTY_BLOCK
    tokens_text = LayoutTokensText(layout_block)
    full_text = str(tokens_text)
    content_match = re.match(regex_pattern, full_text, re.IGNORECASE)
    if not content_match:
        LOGGER.debug('text does not match regex: %r', full_text)
        return EMPTY_BLOCK, layout_block, EMPTY_BLOCK

    def block_for_text_range(range_start: int, range_end: int) -> LayoutBlock:
        # Collect the tokens for the given character range of the text.
        return LayoutBlock.for_tokens(
            list(tokens_text.iter_layout_tokens_between(range_start,
                                                        range_end)))

    content_start = content_match.start(1)
    content_end = content_match.end(1)
    text_length = len(full_text)
    LOGGER.debug('start: %d, end: %d, len: %d (text: %r)', content_start,
                 content_end, text_length, full_text)
    prefix_block = block_for_text_range(0, content_start)
    content_block = block_for_text_range(content_start, content_end)
    suffix_block = block_for_text_range(content_end, text_length)
    return prefix_block, content_block, suffix_block
Esempio n. 14
0
 def test_should_be_able_to_set_title_with_italic_layout_tokens(self):
     """Setting a title block with an italic token should produce a single
     main title element containing an italic `hi` child."""
     title_tokens = [
         LayoutToken('rend'),
         LayoutToken('italic1', font=ITALICS_FONT_1),
         LayoutToken('test'),
     ]
     title_block = LayoutBlock.for_tokens(title_tokens)
     document = TeiDocument()

     document.set_title_layout_block(title_block)

     LOGGER.debug('xml: %r', etree.tostring(document.root))
     title_xpath = (
         '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]'
     )
     nodes = document.root.xpath(title_xpath, namespaces=TEI_NS_MAP)
     assert len(nodes) == 1
     title_node = nodes[0]
     italic_texts = get_tei_xpath_text_content_list(
         title_node, './tei:hi[@rend="italic"]')
     assert italic_texts == ['italic1']
     assert document.get_title() == 'rend italic1 test'
 def test_should_select_tokens_based_on_index(self):
     """Character ranges inside a token should select that token, while
     ranges covering only the whitespace after a token select nothing."""
     token_1 = LayoutToken(text='token1', whitespace=' ')
     token_2 = LayoutToken(text='token2', whitespace=' ')
     layout_tokens_text = LayoutTokensText(
         LayoutBlock.for_tokens([token_1, token_2]))
     assert str(layout_tokens_text) == 'token1 token2'

     # Character offsets within the text 'token1 token2'.
     token_1_end = len(token_1.text)
     token_2_start = token_1_end + 1
     token_2_end = token_2_start + len(token_2.text)
     tokens_between = layout_tokens_text.get_layout_tokens_between

     # first and last character of the first token
     assert tokens_between(0, 1) == [token_1]
     assert tokens_between(token_1_end - 1, token_1_end) == [token_1]
     # the space between the two tokens
     assert not tokens_between(token_1_end, token_2_start)
     # first and last character of the second token
     assert tokens_between(token_2_start, token_2_start + 1) == [token_2]
     assert tokens_between(token_2_end - 1, token_2_end) == [token_2]
     # just past the second token
     assert not tokens_between(token_2_end, token_2_end + 1)
Esempio n. 16
0
def get_layout_page_with_text_or_graphic_replaced_by_graphic(
        layout_page: LayoutPage, semantic_graphic: SemanticGraphic,
        is_only_semantic_graphic_on_page: bool,
        is_replace_overlapping_text: bool) -> LayoutPage:
    """Return a copy of the page with the semantic graphic's layout graphic
    put in place of the graphics it covers.

    Existing page graphics for which ``is_layout_graphic_within_bounding_box``
    reports true (relative to the new graphic's bounding box) are dropped and
    the new graphic is appended after the remaining ones.

    Args:
        layout_page: The page to derive the modified page from.
        semantic_graphic: Its ``layout_graphic`` (with coordinates) is
            required; both are asserted.
        is_only_semantic_graphic_on_page: When true, the new graphic gets a
            ``related_block`` holding all tokens of the page.
        is_replace_overlapping_text: When true, every page token is passed
            through ``_remove_tokens_within_bounding_box_flatmap_fn`` with the
            graphic's bounding box, and empty blocks are removed afterwards.

    Returns:
        The modified layout page.
    """
    replacement_graphic = semantic_graphic.layout_graphic
    assert replacement_graphic
    assert replacement_graphic.coordinates
    bounding_box = replacement_graphic.coordinates.bounding_box
    if is_only_semantic_graphic_on_page:
        all_page_tokens = list(layout_page.iter_all_tokens())
        replacement_graphic = replacement_graphic._replace(
            related_block=LayoutBlock.for_tokens(all_page_tokens))
    page_graphics = []
    for existing_graphic in layout_page.graphics:
        if is_layout_graphic_within_bounding_box(
                existing_graphic, bounding_box=bounding_box):
            continue
        page_graphics.append(existing_graphic)
    page_graphics.append(replacement_graphic)
    page_with_graphic = layout_page.replace(graphics=page_graphics)
    if not is_replace_overlapping_text:
        return page_with_graphic
    token_flatmap_fn = functools.partial(
        _remove_tokens_within_bounding_box_flatmap_fn,
        bounding_box=bounding_box)
    return (
        page_with_graphic
        .flat_map_layout_tokens(token_flatmap_fn)
        .remove_empty_blocks()
    )
Esempio n. 17
0
def _get_semantic_content_for_page_coordinates(
        coordinates: LayoutPageCoordinates) -> SemanticContentWrapper:
    """Build a ``SemanticFigure`` whose only token has the text ``'dummy'``
    and carries the given page coordinates."""
    placeholder_token = LayoutToken(text='dummy', coordinates=coordinates)
    placeholder_block = LayoutBlock.for_tokens([placeholder_token])
    return SemanticFigure(layout_block=placeholder_block)