Beispiel #1
0
 def test_should_find_bbox_and_map_to_page_coordinates(  # pylint: disable=too-many-locals
     self,
     computer_vision_model_mock: MagicMock,
     tmp_path: Path,
     extract_graphic_assets: bool
 ):
     """Check that a detected bounding box is reported in layout page coordinates.

     The page image is 20x10 pixels while the layout page is 200x100, so the
     detected box (1, 2, 3, 4) is expected at (10, 20, 30, 40). When assets
     are extracted, the saved crop must have the pixel size of the detected box.
     """
     page_number = 10
     page_image_file = tmp_path / 'page10.png'
     PIL.Image.new('RGB', size=(20, 10), color=(255, 0, 0)).save(page_image_file)

     # Make the mocked model report exactly one instance.
     detected_bbox = BoundingBox(x=1, y=2, width=3, height=4)
     prediction = computer_vision_model_mock.predict_single.return_value
     prediction.get_instances_by_type_name.return_value = [
         SimpleComputerVisionModelInstance(bounding_box=detected_bbox)
     ]

     document = LayoutDocument(pages=[
         _create_page(
             coordinates=LayoutPageCoordinates(
                 x=0, y=0, width=200, height=100, page_number=page_number
             )
         )
     ])
     provider = ComputerVisionDocumentGraphicProvider(
         computer_vision_model=computer_vision_model_mock,
         page_image_iterable=[DocumentPageImage(
             page_number=page_number,
             page_image_path=str(page_image_file)
         )],
         temp_dir=str(tmp_path)
     )

     found_graphics = list(provider.iter_semantic_graphic_for_layout_document(
         layout_document=document,
         extract_graphic_assets=extract_graphic_assets
     ))
     assert found_graphics
     first_graphic = found_graphics[0]
     LOGGER.debug('semantic_graphic: %s', first_graphic)

     layout_graphic = first_graphic.layout_graphic
     assert layout_graphic is not None
     assert layout_graphic.coordinates == LayoutPageCoordinates(
         x=10, y=20, width=30, height=40, page_number=page_number
     )

     if not extract_graphic_assets:
         assert not first_graphic.relative_path
         return

     saved_path = layout_graphic.local_file_path
     assert saved_path
     assert first_graphic.relative_path == os.path.basename(saved_path)
     with PIL.Image.open(saved_path) as cropped_image:
         assert cropped_image.width == detected_bbox.width
         assert cropped_image.height == detected_bbox.height
 def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
         self):
     """Check that tokens and graphics inside a semantic graphic's box are replaced.

     The token and graphic at y=10 lie outside the semantic graphic's box
     (y=90, height=50) and must be kept; those at y=100 lie inside it and
     must be removed. The semantic graphic with a zero-sized box must not
     show up as an additional page graphic.
     """
     def coordinates_on_page_1(*bounds) -> LayoutPageCoordinates:
         # bounds are passed positionally to BoundingBox
         return LayoutPageCoordinates.from_bounding_box(
             BoundingBox(*bounds), page_number=1)

     page_coordinates = coordinates_on_page_1(0, 0, 200, 200)
     new_graphic_coordinates = coordinates_on_page_1(10, 90, 100, 50)
     outside_coordinates = coordinates_on_page_1(10, 10, 100, 20)
     inside_coordinates = coordinates_on_page_1(10, 100, 100, 20)
     zero_size_coordinates = coordinates_on_page_1(10, 100, 0, 0)

     keep_token = LayoutToken('keep', coordinates=outside_coordinates)
     remove_token = LayoutToken('remove', coordinates=inside_coordinates)
     keep_graphic = LayoutGraphic(
         coordinates=outside_coordinates, graphic_type='keep-graphic')
     remove_graphic = LayoutGraphic(
         coordinates=inside_coordinates, graphic_type='remove-graphic')

     source_page = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[keep_token, remove_token])])
         ],
         graphics=[keep_graphic, remove_graphic],
         meta=LayoutPageMeta(
             page_number=page_coordinates.page_number,
             coordinates=page_coordinates))
     source_document = LayoutDocument(pages=[source_page])

     new_graphic = LayoutGraphic(
         coordinates=new_graphic_coordinates, graphic_type='new-graphic')
     zero_size_graphic = LayoutGraphic(
         coordinates=zero_size_coordinates,
         graphic_type='empty-coords-graphic')

     result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
         source_document,
         semantic_graphics=[
             SemanticGraphic(layout_graphic=new_graphic),
             SemanticGraphic(layout_graphic=zero_size_graphic)
         ])

     result_page = result.pages[0]
     LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
     assert result_page.graphics[:-1] == [keep_graphic]

     replacement_graphic = result_page.graphics[-1]
     LOGGER.debug('result.pages[0].graphics[-1]: %r', replacement_graphic)
     assert replacement_graphic.graphic_type == new_graphic.graphic_type
     assert replacement_graphic.coordinates == new_graphic.coordinates

     remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
     assert remaining_tokens == [keep_token]
     related_tokens = list(
         result_page.graphics[-1].related_block.iter_all_tokens())
     assert related_tokens == [keep_token, remove_token]
Beispiel #3
0
 def test_should_detect_indented_blocks(self):
     """A line starting at x=50 after a line starting at x=10 is reported as indented."""
     def token_starting_at(x):
         return LayoutToken(
             'x',
             coordinates=LayoutPageCoordinates(x=x, y=10, width=10, height=10))

     feature = LineIndentationStatusFeature()
     feature.on_new_block()

     feature.on_new_line()
     first_line_indented = feature.get_is_indented_and_update(
         token_starting_at(10))
     assert first_line_indented is False

     feature.on_new_line()
     second_line_indented = feature.get_is_indented_and_update(
         token_starting_at(50))
     assert second_line_indented is True
 def test_should_not_merge_coordinates_on_different_pages(self):
     """Coordinates with different page numbers must be returned unmerged."""
     on_page_1 = LayoutPageCoordinates(
         x=10, y=10, width=100, height=100, page_number=1)
     on_page_2 = LayoutPageCoordinates(
         x=110, y=10, width=100, height=100, page_number=2)
     unmerged = [on_page_1, on_page_2]
     merged = get_merged_coordinates_list(unmerged)
     assert merged == unmerged
 def test_should_retokenize_document_with_placeholders(self):
     """Retokenizing splits 'token1 token2' and apportions the box by text length.

     The original token is 100 wide starting at x=10; each resulting token is
     expected to get a share of that width proportional to its character count.
     """
     text = 'token1 token2'
     source_token = LayoutToken(
         text,
         whitespace='\n',
         coordinates=LayoutPageCoordinates(x=10, y=10, width=100, height=50))
     source_page = LayoutPage(
         blocks=[LayoutBlock.for_tokens([source_token])],
         graphics=[])
     retokenized_layout_document = retokenize_layout_document(
         LayoutDocument(pages=[source_page]))

     line = retokenized_layout_document.pages[0].blocks[0].lines[0]
     token_texts = [token.text for token in line.tokens]
     assert token_texts == ['token1', 'token2']
     token_whitespaces = [token.whitespace for token in line.tokens]
     assert token_whitespaces == [' ', '\n']

     first_coordinates = line.tokens[0].coordinates
     assert first_coordinates.x == 10.0
     assert first_coordinates.width == 100 * len('token1') / len(text)

     second_coordinates = line.tokens[1].coordinates
     assert second_coordinates.x == 10.0 + 100 * len('token1 ') / len(text)
     assert second_coordinates.width == 100 * len('token2') / len(text)
Beispiel #6
0
 def parse_page_coordinates(self, node: etree.ElementBase,
                            page_number: int) -> LayoutPageCoordinates:
     """Build page coordinates from the positional attributes of ``node``.

     Reads ``HPOS``, ``VPOS``, ``WIDTH`` and ``HEIGHT`` as floats; an
     attribute that is absent defaults to 0.
     """
     x, y, width, height = (
         float(node.attrib.get(attribute_name, 0))
         for attribute_name in ('HPOS', 'VPOS', 'WIDTH', 'HEIGHT')
     )
     return LayoutPageCoordinates(
         x=x, y=y, width=width, height=height, page_number=page_number)
Beispiel #7
0
 def iter_semantic_graphic_for_image(  # pylint: disable=too-many-locals
         self, image: PIL.Image.Image, extract_graphic_assets: bool,
         page_number: int,
         page: Optional[LayoutPage]) -> Iterable[SemanticGraphic]:
     """Yield one semantic graphic per 'Figure' instance detected in ``image``.

     Each detected bounding box is scaled from image pixels to page
     coordinates when the page provides coordinates. If an existing page
     graphic has similar coordinates, that graphic is yielded instead of a
     new one. Otherwise a 'cv-figure' graphic is created and, when
     ``extract_graphic_assets`` is set, the crop is saved to ``self.temp_dir``.
     """
     LOGGER.debug('image size: %d x %d', image.width, image.height)
     page_coordinates = None if page is None else page.meta.coordinates
     page_graphics = [] if page is None else page.graphics

     started_at = monotonic()
     cv_result = self.computer_vision_model.predict_single(image)
     elapsed_seconds = monotonic() - started_at

     figure_bounding_boxes = [
         figure_instance.get_bounding_box()
         for figure_instance in cv_result.get_instances_by_type_name('Figure')
     ]
     LOGGER.info(
         'cv result, took=%.3fs, page_number=%d, image_size=%dx%d, figure_coordinates_list=%r',
         elapsed_seconds, page_number, image.width, image.height,
         figure_bounding_boxes)

     for figure_number, image_bounding_box in enumerate(
             figure_bounding_boxes, start=1):
         if page_coordinates:
             # convert from image pixels to page units
             page_bounding_box = image_bounding_box.scale_by(
                 page_coordinates.width / image.width,
                 page_coordinates.height / image.height)
         else:
             page_bounding_box = image_bounding_box

         existing_graphic = get_layout_graphic_with_similar_coordinates(
             page_graphics=page_graphics,
             bounding_box=page_bounding_box,
             ignored_graphic_types=self.ignored_graphic_types)
         if existing_graphic is not None:
             # prefer the graphic already present on the page
             yield get_semantic_graphic_for_layout_graphic(
                 existing_graphic,
                 extract_graphic_assets=extract_graphic_assets)
             continue

         local_image_path: Optional[str] = None
         relative_image_path: Optional[str] = None
         if extract_graphic_assets:
             local_image_path = os.path.join(
                 self.temp_dir, f'figure-{page_number}-{figure_number}.png')
             relative_image_path = os.path.basename(local_image_path)
             # the crop uses the unscaled (pixel) bounding box
             get_cropped_image(image, image_bounding_box).save(local_image_path)

         figure_coordinates = LayoutPageCoordinates(
             x=page_bounding_box.x,
             y=page_bounding_box.y,
             width=page_bounding_box.width,
             height=page_bounding_box.height,
             page_number=page_number)
         yield SemanticGraphic(
             layout_graphic=LayoutGraphic(
                 coordinates=figure_coordinates,
                 graphic_type='cv-figure',
                 local_file_path=local_image_path),
             relative_path=relative_image_path)
Beispiel #8
0
 def test_should_return_the_best_matching_graphic(
     self
 ):
     """Of several candidate graphics, the second one is expected to be selected."""
     wider_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
         x=10, y=10, width=200, height=100
     ))
     expected_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
         x=10, y=10, width=100, height=100
     ))
     shifted_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
         x=100, y=10, width=100, height=100
     ))
     query_bounding_box = BoundingBox(x=10, y=10, width=90, height=100)
     result = get_layout_graphic_with_similar_coordinates(
         [wider_graphic, expected_graphic, shifted_graphic],
         query_bounding_box
     )
     assert result == expected_graphic
Beispiel #9
0
 def test_should_prefer_embedded_graphic(  # pylint: disable=too-many-locals
     self,
     computer_vision_model_mock: MagicMock,
     tmp_path: Path
 ):
     """Check that a graphic already on the page wins over a newly detected one.

     The detected box (1, 2, 3, 4) on a 20x10 image corresponds to
     (10, 20, 30, 40) on the 200x100 layout page, which is where the
     embedded graphic sits.
     """
     page_number = 10
     page_image_file = tmp_path / 'page10.png'
     PIL.Image.new('RGB', size=(20, 10), color=(255, 0, 0)).save(page_image_file)

     embedded_graphic = LayoutGraphic(
         coordinates=LayoutPageCoordinates(
             x=10, y=20, width=30, height=40, page_number=page_number
         )
     )
     document = LayoutDocument(pages=[
         _create_page(
             coordinates=LayoutPageCoordinates(
                 x=0, y=0, width=200, height=100, page_number=page_number
             ),
             graphics=[embedded_graphic]
         )
     ])

     # Make the mocked model report exactly one instance.
     prediction = computer_vision_model_mock.predict_single.return_value
     prediction.get_instances_by_type_name.return_value = [
         SimpleComputerVisionModelInstance(
             bounding_box=BoundingBox(x=1, y=2, width=3, height=4)
         )
     ]

     provider = ComputerVisionDocumentGraphicProvider(
         computer_vision_model=computer_vision_model_mock,
         page_image_iterable=[DocumentPageImage(
             page_number=page_number,
             page_image_path=str(page_image_file)
         )],
         temp_dir=str(tmp_path)
     )
     found_graphics = list(provider.iter_semantic_graphic_for_layout_document(
         layout_document=document,
         extract_graphic_assets=True
     ))
     assert found_graphics
     first_graphic = found_graphics[0]
     LOGGER.debug('semantic_graphic: %s', first_graphic)
     assert first_graphic.layout_graphic == embedded_graphic
 def test_should_merge_coordinates_on_same_line(self):
     """Two horizontally adjacent boxes on one page merge into a single wide box."""
     left_box = LayoutPageCoordinates(
         x=10, y=10, width=100, height=100, page_number=1)
     right_box = LayoutPageCoordinates(
         x=110, y=10, width=100, height=100, page_number=1)
     # spans from the left edge of the first box to the right edge of the second
     expected_box = LayoutPageCoordinates(
         x=10, y=10, width=110 - 10 + 100, height=100, page_number=1)
     merged = get_merged_coordinates_list([left_box, right_box])
     assert merged == [expected_box]
 def test_should_merge_coordinates_above_each_other(self):
     """Two vertically adjacent boxes on one page merge into a single tall box."""
     upper_box = LayoutPageCoordinates(
         x=10, y=10, width=100, height=100, page_number=1)
     lower_box = LayoutPageCoordinates(
         x=10, y=110, width=100, height=100, page_number=1)
     # spans from the top edge of the first box to the bottom edge of the second
     expected_box = LayoutPageCoordinates(
         x=10, y=10, width=100, height=110 - 10 + 100, page_number=1)
     merged = get_merged_coordinates_list([upper_box, lower_box])
     assert merged == [expected_box]
Beispiel #12
0
 def test_should_parse_page_meta_data(self):
     """Page number and page size should be taken from the Page element attributes."""
     page_node = ALTO_E.Page(
         {'PHYSICAL_IMG_NR': '10', 'WIDTH': '101', 'HEIGHT': '102'},
         ALTO_E.PrintSpace(),
     )
     page = AltoParser().parse_page(page_node, page_index=0)
     page_meta = page.meta
     assert page_meta.page_number == 10
     expected_coordinates = LayoutPageCoordinates(
         x=0, y=0, width=101, height=102, page_number=10
     )
     assert page_meta.coordinates == expected_coordinates
Beispiel #13
0
 def test_should_ignore_matches_below_threshold(
     self
 ):
     """No graphic should be returned for a 10x1000 box against a 100x100 graphic."""
     only_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
         x=10, y=10, width=100, height=100
     ))
     query_bounding_box = BoundingBox(x=10, y=10, width=10, height=1000)
     result = get_layout_graphic_with_similar_coordinates(
         [only_graphic],
         query_bounding_box
     )
     assert result is None
Beispiel #14
0
 def test_should_ignore_svg_graphics(
     self
 ):
     """A graphic whose type is in ``ignored_graphic_types`` must not be matched."""
     svg_graphic = LayoutGraphic(
         coordinates=LayoutPageCoordinates.from_bounding_box(BOUNDING_BOX_1),
         graphic_type='svg'
     )
     # the query box is identical to the graphic's box, so only the type rules it out
     result = get_layout_graphic_with_similar_coordinates(
         [svg_graphic],
         BOUNDING_BOX_1,
         ignored_graphic_types={'svg'}
     )
     assert result is None
Beispiel #15
0
 def parse_page(self, page_node: etree.ElementBase,
                page_index: int) -> LayoutPage:
     """Parse a page element into a ``LayoutPage``.

     The page number comes from ``PHYSICAL_IMG_NR`` when that attribute is
     present and non-empty, otherwise it is ``1 + page_index``. Page
     coordinates are only set when both ``WIDTH`` and ``HEIGHT`` are present
     and non-empty. Blocks are parsed from all descendant ``alto:TextBlock``
     elements and graphics from all descendant ``alto:Illustration`` elements.
     """
     page_number_str = page_node.attrib.get('PHYSICAL_IMG_NR')
     if page_number_str:
         page_number = int(page_number_str)
     else:
         page_number = 1 + page_index

     width_str = page_node.attrib.get('WIDTH')
     height_str = page_node.attrib.get('HEIGHT')
     coordinates = None
     if width_str and height_str:
         coordinates = LayoutPageCoordinates(
             x=0,
             y=0,
             width=float(width_str),
             height=float(height_str),
             page_number=page_number)

     page_meta = LayoutPageMeta(
         page_number=page_number, coordinates=coordinates)
     page_blocks = [
         self.parse_block(text_block_node, page_number=page_number)
         for text_block_node in alto_xpath(page_node, './/alto:TextBlock')
     ]
     page_graphics = [
         self.parse_graphic(illustration_node, page_number=page_number)
         for illustration_node in alto_xpath(page_node, './/alto:Illustration')
     ]
     return LayoutPage(
         meta=page_meta, blocks=page_blocks, graphics=page_graphics)
from sciencebeam_parser.utils.bounding_box import BoundingBox
from sciencebeam_parser.document.layout_document import (
    LayoutBlock, LayoutDocument, LayoutGraphic, LayoutLine, LayoutPage,
    LayoutPageCoordinates, LayoutPageMeta, LayoutToken)
from sciencebeam_parser.processors.graphic_provider import (
    SimpleDocumentGraphicProvider,
    get_layout_document_with_graphics_replaced_by_graphics,
    get_layout_document_with_text_and_graphics_replaced_by_graphics,
    get_page_numbers_with_mostly_bitmap_graphics,
    get_page_numbers_with_uncommon_page_dimension)

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# Coordinates constant on page 1 (x=10, y=11, 100 wide, 101 high).
LAYOUT_PAGE_COORDINATES_1 = LayoutPageCoordinates(x=10,
                                                  y=11,
                                                  width=100,
                                                  height=101,
                                                  page_number=1)

# Coordinates constant on page 2; same position and height as
# LAYOUT_PAGE_COORDINATES_1 but twice as wide.
LAYOUT_PAGE_COORDINATES_2 = LayoutPageCoordinates(x=10,
                                                  y=11,
                                                  width=200,
                                                  height=101,
                                                  page_number=2)


def _get_layout_document_for_layout_graphic(
        layout_graphic: LayoutGraphic) -> LayoutDocument:
    """Return a document with a single page that has no blocks and only ``layout_graphic``."""
    single_page = LayoutPage(blocks=[], graphics=[layout_graphic])
    return LayoutDocument(pages=[single_page])
Beispiel #17
0
# Font size of the first font fixture.
FONTSIZE_1 = 11.1

# Id, family and size of the second font fixture.
FONT_ID_2 = 'font2'
FONTFAMILY_2 = 'fontfamily2'
FONTSIZE_2 = 22.2

# Font style names.
# NOTE(review): presumably used as ALTO FONTSTYLE values - confirm against usage.
BOLD = 'bold'
ITALICS = 'italics'
SUBSCRIPT = 'subscript'
SUPERSCRIPT = 'superscript'

# Token text fixtures.
TOKEN_1 = 'token1'
TOKEN_2 = 'token2'

# Two distinct coordinates fixtures, both on page 1.
COORDINATES_1 = LayoutPageCoordinates(
    x=100.1, y=101.1, width=102.2, height=103.3, page_number=1
)

COORDINATES_2 = LayoutPageCoordinates(
    x=200.1, y=201.1, width=202.2, height=203.3, page_number=1
)


class TestAltoParser:
    """Tests for ``AltoParser``."""

    def test_should_parse_font_without_fontstyle(self):
        """A text style given only id, family and size should not parse as bold."""
        text_style_node = ALTO_E.TextStyle(
            ID=FONT_ID_1,
            FONTFAMILY=FONTFAMILY_1,
            FONTSIZE=str(FONTSIZE_1)
        )
        parsed_font = AltoParser().parse_font(text_style_node)
        assert parsed_font.is_bold is False
from sciencebeam_parser.document.layout_document import (
    EMPTY_BLOCK, LayoutLineDescriptor, LayoutPageCoordinates, LayoutPageMeta,
    LayoutToken, LayoutLine, LayoutBlock, LayoutPage, LayoutDocument,
    LayoutTokensText, get_merged_coordinates_list, retokenize_layout_document,
    remove_empty_blocks)

# Coordinates constant: a 100x100 box at (10, 10) on page 1.
COORDINATES_1 = LayoutPageCoordinates(x=10,
                                      y=10,
                                      width=100,
                                      height=100,
                                      page_number=1)


class TestGetMergedCoordinatesList:
    def test_should_merge_coordinates_on_same_line(self):
        assert get_merged_coordinates_list([
            LayoutPageCoordinates(x=10,
                                  y=10,
                                  width=100,
                                  height=100,
                                  page_number=1),
            LayoutPageCoordinates(x=110,
                                  y=10,
                                  width=100,
                                  height=100,
                                  page_number=1)
        ]) == [
            LayoutPageCoordinates(x=10,
                                  y=10,
                                  width=110 - 10 + 100,
                                  height=100,
Beispiel #19
0
    def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """Check that a figure is extracted with label, caption, citation and graphic.

        The mocked models label one block as a figure marker (the citation)
        and another as a figure containing a label and a caption. The test
        expects exactly one ``SemanticFigure`` with id ``fig_0``, one figure
        citation targeting it, and the page's graphic attached to the figure.

        ``segmentation_label`` is the label assigned to the whole text block
        by the mocked segmentation model (presumably supplied via test
        parametrization - the decorator is not visible here).
        """
        citation_block = LayoutBlock.for_text('Figure 1')
        # The graphic, label and caption get coordinates derived from each
        # other: each move_by(dy=10) starts from the previous coordinates.
        _coordinates = LayoutPageCoordinates(x=10, y=10, width=100, height=10)
        graphic_local_file_path = '/path/to/graphic1.svg'
        graphic = LayoutGraphic(coordinates=_coordinates,
                                local_file_path=graphic_local_file_path)
        _coordinates = _coordinates.move_by(dy=10)
        label_block = LayoutBlock.for_text('Figure 1',
                                           coordinates=_coordinates)
        _coordinates = _coordinates.move_by(dy=10)
        caption_block = LayoutBlock.for_text('Caption 1',
                                             coordinates=_coordinates)
        # Block without coordinates and without a figure-model label,
        # placed between the label and the caption.
        other_block = LayoutBlock.for_text('Other')
        figure_block = LayoutBlock.merge_blocks(
            [label_block, other_block, caption_block])
        fulltext_block = LayoutBlock.merge_blocks(
            [citation_block, figure_block])
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_figure_fields=True,
                                    extract_graphic_bounding_boxes=True,
                                    extract_graphic_assets=True))

        segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
        figure_model_mock = fulltext_models_mock.figure_model_mock

        # Segmentation model: the whole text gets the given segmentation label.
        segmentation_model_mock.update_label_by_layout_block(
            fulltext_block, segmentation_label)

        # Fulltext model: citation vs. figure.
        fulltext_model_mock.update_label_by_layout_block(
            citation_block, '<figure_marker>')
        fulltext_model_mock.update_label_by_layout_block(
            figure_block, '<figure>')

        # Figure model: label and caption within the figure.
        figure_model_mock.update_label_by_layout_block(label_block, '<label>')
        figure_model_mock.update_label_by_layout_block(caption_block,
                                                       '<figDesc>')

        layout_document = LayoutDocument(
            pages=[LayoutPage(blocks=[fulltext_block], graphics=[graphic])])
        semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=layout_document)
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None
        # Look for figures in both the body and the back section.
        figure_list = list(
            iter_by_semantic_type_recursively([
                semantic_document.body_section, semantic_document.back_section
            ], SemanticFigure))
        assert len(figure_list) == 1
        figure = figure_list[0]
        assert figure.get_text_by_type(SemanticLabel) == label_block.text
        assert figure.get_text_by_type(SemanticCaption) == caption_block.text
        assert figure.content_id == 'fig_0'
        # The citation must point at the figure's content id.
        figure_citation_list = list(
            semantic_document.iter_by_type_recursively(SemanticFigureCitation))
        assert len(figure_citation_list) == 1
        assert figure_citation_list[0].get_text() == citation_block.text
        assert figure_citation_list[0].target_content_id == 'fig_0'
        # The page graphic must be attached to the figure, with a relative
        # path equal to the file name of its local path.
        semantic_graphic_list = list(figure.iter_by_type(SemanticGraphic))
        assert semantic_graphic_list
        assert semantic_graphic_list[0].layout_graphic == graphic
        assert semantic_graphic_list[0].relative_path == os.path.basename(
            graphic_local_file_path)
Beispiel #20
0
from sciencebeam_parser.document.layout_document import (LayoutBlock,
                                                         LayoutGraphic,
                                                         LayoutPageCoordinates,
                                                         LayoutToken)
from sciencebeam_parser.document.semantic_document import (
    SemanticContentWrapper, SemanticFigure, SemanticGraphic, SemanticLabel,
    SemanticMixedContentWrapper)
from sciencebeam_parser.processors.graphic_matching import (
    BoundingBoxDistanceGraphicMatcher, GraphicRelatedBlockTextGraphicMatcher,
    OpticalCharacterRecognitionGraphicMatcher, get_bounding_box_list_distance)

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# Coordinates constant: a 200x100 box at (10, 100) on page 1.
COORDINATES_1 = LayoutPageCoordinates(x=10,
                                      y=100,
                                      width=200,
                                      height=100,
                                      page_number=1)

# Coordinates for a graphic; same values as COORDINATES_1.
GRAPHIC_ABOVE_FIGURE_COORDINATES_1 = LayoutPageCoordinates(x=10,
                                                           y=100,
                                                           width=200,
                                                           height=100,
                                                           page_number=1)

FIGURE_BELOW_GRAPHIC_COORDINATES_1 = LayoutPageCoordinates(
    x=10,
    y=GRAPHIC_ABOVE_FIGURE_COORDINATES_1.y +
    GRAPHIC_ABOVE_FIGURE_COORDINATES_1.height + 10,
    width=200,
    height=20,
    SemanticNameSuffix, SemanticNameTitle, SemanticParagraph,
    SemanticRawEquation, SemanticRawEquationContent, SemanticRawReference,
    SemanticRawReferenceText, SemanticReference, SemanticReferenceCitation,
    SemanticReferenceList, SemanticSection, SemanticSectionTypes,
    SemanticSurname, SemanticTable, SemanticTableCitation,
    SemanticTextContentWrapper, SemanticTitle)
from sciencebeam_parser.document.tei_document import (
    get_tei_for_semantic_document)
from tests.document.tei.common_test import (TOKEN_1, TOKEN_2)

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# URL and DOI string constants.
WEB_URL_1 = 'http://host/path'
DOI_1 = '10.1234/test'

# Coordinates constant built from positional arguments.
# NOTE(review): presumably in the order x, y, width, height - confirm against
# the field order of LayoutPageCoordinates.
COORDINATES_1 = LayoutPageCoordinates(10, 20, 110, 120)


class TestGetTeiForSemanticDocument:  # pylint: disable=too-many-public-methods
    def test_should_return_empty_document(self):
        """An empty semantic document should produce TEI without any ``tei:div``."""
        tei_document = get_tei_for_semantic_document(SemanticDocument())
        div_nodes = tei_document.xpath('//tei:div')
        assert not div_nodes

    def test_should_set_manuscript_title(self):
        semantic_document = SemanticDocument()
        semantic_document.front.add_content(
            SemanticTitle(layout_block=LayoutBlock.for_text(TOKEN_1)))
        tei_document = get_tei_for_semantic_document(semantic_document)
        LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
        assert tei_document.get_xpath_text_content_list(