Beispiel #1
0
    def test_should_extract_references_fields_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that reference fields are extracted and linked to the body citation.

        The layout has a body block ('the body' followed by the citation
        marker '1') and a reference block (label '1' followed by the title).
        Exactly one ``SemanticReference`` is expected, exposing the title,
        label and raw reference text under content id ``b0``, together with
        exactly one ``SemanticReferenceCitation`` targeting that id.
        """
        body_text = LayoutBlock.for_text('the body')
        citation_marker = LayoutBlock.for_text('1')
        body = LayoutBlock.merge_blocks([body_text, citation_marker])
        ref_label = LayoutBlock.for_text('1')
        ref_title = LayoutBlock.for_text('Reference Title 1')
        ref_text = LayoutBlock.merge_blocks([ref_title])
        ref_entry = LayoutBlock.merge_blocks([ref_label, ref_text])
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_citation_fields=True))

        segmentation = fulltext_models_mock.segmentation_model_mock
        fulltext = fulltext_models_mock.fulltext_model_mock
        reference_segmenter = fulltext_models_mock.reference_segmenter_model_mock
        citation = fulltext_models_mock.citation_model_mock

        # One (model mock, layout block, label) triple per configured prediction.
        for model_mock, block, label in (
                (segmentation, body, '<body>'),
                (segmentation, ref_entry, '<references>'),
                (fulltext, body_text, '<section>'),
                (fulltext, citation_marker, '<citation_marker>'),
                (reference_segmenter, ref_label, '<label>'),
                (reference_segmenter, ref_text, '<reference>'),
                (citation, ref_title, '<title>')):
            model_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[body, ref_entry])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        reference_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        parsed_references = list(
            reference_lists[0].iter_by_type(SemanticReference))
        assert len(parsed_references) == 1
        parsed_reference = parsed_references[0]
        assert parsed_reference.get_text_by_type(SemanticTitle) == ref_title.text
        assert parsed_reference.get_text_by_type(SemanticLabel) == ref_label.text
        assert parsed_reference.get_text_by_type(
            SemanticRawReferenceText) == ref_text.text
        assert parsed_reference.content_id == 'b0'

        # The citation marker in the body should point at the extracted reference.
        citations = list(
            document.iter_by_type_recursively(SemanticReferenceCitation))
        assert len(citations) == 1
        assert citations[0].target_content_id == 'b0'
Beispiel #2
0
    def test_should_extract_table_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """Check that a table's label and caption are extracted and cited.

        The whole layout block is given ``segmentation_label`` by the
        segmentation mock. Within it, a 'Table 1' marker precedes a table
        block made of a label, an unlabelled 'Other' block and a caption.
        Exactly one ``SemanticTable`` with content id ``tab_0`` is expected
        across the body and back sections, and exactly one
        ``SemanticTableCitation`` targeting it.
        """
        table_marker = LayoutBlock.for_text('Table 1')
        table_label = LayoutBlock.for_text('Table 1')
        table_caption = LayoutBlock.for_text('Caption 1')
        unlabelled = LayoutBlock.for_text('Other')
        table_content = LayoutBlock.merge_blocks(
            [table_label, unlabelled, table_caption])
        segment = LayoutBlock.merge_blocks([table_marker, table_content])
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_table_fields=True))

        segmentation = fulltext_models_mock.segmentation_model_mock
        fulltext = fulltext_models_mock.fulltext_model_mock
        table_model = fulltext_models_mock.table_model_mock

        # One (model mock, layout block, label) triple per configured prediction.
        for model_mock, block, label in (
                (segmentation, segment, segmentation_label),
                (fulltext, table_marker, '<table_marker>'),
                (fulltext, table_content, '<table>'),
                (table_model, table_label, '<label>'),
                (table_model, table_caption, '<figDesc>')):
            model_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[segment])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        # The table is searched for in both the body and the back section.
        searched_sections = [document.body_section, document.back_section]
        tables = list(
            iter_by_semantic_type_recursively(searched_sections, SemanticTable))
        assert len(tables) == 1
        parsed_table = tables[0]
        assert parsed_table.get_text_by_type(SemanticLabel) == table_label.text
        assert parsed_table.get_text_by_type(SemanticCaption) == table_caption.text
        assert parsed_table.content_id == 'tab_0'

        table_citations = list(
            document.iter_by_type_recursively(SemanticTableCitation))
        assert len(table_citations) == 1
        table_citation = table_citations[0]
        assert table_citation.get_text() == table_marker.text
        assert table_citation.target_content_id == 'tab_0'
Beispiel #3
0
    def test_should_extract_editor_names_from_references_fields(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that editor names are extracted from a reference.

        The reference consists solely of an editors block: a given name, an
        unlabelled 'Other' block and a surname. Citation author extraction is
        disabled and editor extraction enabled. Exactly one ``SemanticEditor``
        is expected, exposing the given name and surname texts.
        """
        forename = LayoutBlock.for_text('Given name')
        family_name = LayoutBlock.for_text('Surname')
        unlabelled = LayoutBlock.for_text('Other')
        editors_content = LayoutBlock.merge_blocks(
            [forename, unlabelled, family_name])
        ref_text = LayoutBlock.merge_blocks([editors_content])
        ref_entry = LayoutBlock.merge_blocks([ref_text])
        config = FullTextProcessorConfig(extract_citation_fields=True,
                                         extract_citation_authors=False,
                                         extract_citation_editors=True)
        processor = FullTextProcessor(fulltext_models_mock, config)

        segmentation = fulltext_models_mock.segmentation_model_mock
        reference_segmenter = fulltext_models_mock.reference_segmenter_model_mock
        citation = fulltext_models_mock.citation_model_mock
        name_citation = fulltext_models_mock.name_citation_model_mock

        # One (model mock, layout block, label) triple per configured prediction.
        for model_mock, block, label in (
                (segmentation, ref_entry, '<references>'),
                (reference_segmenter, ref_text, '<reference>'),
                (citation, editors_content, '<editor>'),
                (name_citation, forename, '<forename>'),
                (name_citation, family_name, '<surname>')):
            model_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[ref_entry])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        reference_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        parsed_references = list(
            reference_lists[0].iter_by_type(SemanticReference))
        assert len(parsed_references) == 1
        parsed_editors = list(
            parsed_references[0].iter_by_type(SemanticEditor))
        assert len(parsed_editors) == 1
        parsed_editor = parsed_editors[0]
        assert parsed_editor.given_name_text == forename.text
        assert parsed_editor.surname_text == family_name.text
Beispiel #4
0
    def test_should_extract_invalid_reference_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that a reference labelled only 'O' is reported as invalid.

        The same layout block serves as the references segment, the single
        reference and the citation model input; the citation mock labels all
        of it 'O'. Exactly one ``SemanticInvalidReference`` carrying the
        original text is expected in the reference list.
        """
        body_text = LayoutBlock.for_text('the body')
        citation_marker = LayoutBlock.for_text('1')
        body = LayoutBlock.merge_blocks([body_text, citation_marker])
        invalid_ref = LayoutBlock.for_text('This is an invalid reference 1')
        # The one block plays all three roles: invalid text, reference, references segment.
        ref_text = invalid_ref
        ref_segment = ref_text
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_citation_fields=True))

        segmentation = fulltext_models_mock.segmentation_model_mock
        fulltext = fulltext_models_mock.fulltext_model_mock
        reference_segmenter = fulltext_models_mock.reference_segmenter_model_mock
        citation = fulltext_models_mock.citation_model_mock

        # One (model mock, layout block, label) triple per configured prediction.
        for model_mock, block, label in (
                (segmentation, body, '<body>'),
                (segmentation, ref_segment, '<references>'),
                (fulltext, body_text, '<section>'),
                (fulltext, citation_marker, '<citation_marker>'),
                (reference_segmenter, ref_text, '<reference>'),
                (citation, invalid_ref, 'O')):
            model_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[body, ref_segment])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        reference_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        invalid_references = list(
            reference_lists[0].iter_by_type(SemanticInvalidReference))
        assert len(invalid_references) == 1
        assert invalid_references[0].get_text() == invalid_ref.text
Beispiel #5
0
    def test_should_extract_author_names_separated_by_another_tag(
            self, fulltext_models_mock: MockFullTextModels):
        """Check that author name parts split by an unlabelled block form one author.

        The header consists of a given name, an 'Other' block that the header
        mock is not given a label for, and a surname. With
        ``merge_raw_authors=True`` the front matter is expected to contain a
        single author exposing both the given name and the surname.
        """
        forename = LayoutBlock.for_text('Given name')
        family_name = LayoutBlock.for_text('Surname')
        authors_content = LayoutBlock.merge_blocks(
            [forename, LayoutBlock.for_text('Other'), family_name])
        processor = FullTextProcessor(
            fulltext_models_mock,
            config=FullTextProcessorConfig(merge_raw_authors=True))
        # The header is made up of the authors block only.
        header = authors_content

        header_model = fulltext_models_mock.header_model_mock
        name_header_model = fulltext_models_mock.name_header_model_mock

        fulltext_models_mock.segmentation_model_mock.update_label_by_layout_block(
            header, '<header>')

        # Only the name parts are labelled as author; the block in between is not.
        header_model.update_label_by_layout_block(forename, '<author>')
        header_model.update_label_by_layout_block(family_name, '<author>')

        name_header_model.update_label_by_layout_block(forename, '<forename>')
        name_header_model.update_label_by_layout_block(family_name, '<surname>')

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[header])]))
        assert document is not None

        parsed_authors = document.front.authors
        assert len(parsed_authors) == 1
        parsed_author = parsed_authors[0]
        assert parsed_author.given_name_text == forename.text
        assert parsed_author.surname_text == family_name.text
Beispiel #6
0
    def test_should_extract_raw_references_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that raw references are kept when citation fields are not extracted.

        With ``extract_citation_fields=False`` the back section text is
        expected to equal the reference block text, and the reference list to
        contain exactly one ``SemanticRawReference`` exposing the label and
        raw reference text under content id ``b0``.
        """
        ref_label = LayoutBlock.for_text('1')
        ref_text = LayoutBlock.for_text('Reference 1')
        ref_entry = LayoutBlock.merge_blocks([ref_label, ref_text])
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_citation_fields=False))

        segmentation = fulltext_models_mock.segmentation_model_mock
        reference_segmenter = fulltext_models_mock.reference_segmenter_model_mock

        # One (model mock, layout block, label) triple per configured prediction.
        for model_mock, block, label in (
                (segmentation, ref_entry, '<references>'),
                (reference_segmenter, ref_label, '<label>'),
                (reference_segmenter, ref_text, '<reference>')):
            model_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[ref_entry])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None
        assert document.back_section.get_text() == ref_entry.text

        reference_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        raw_references = list(
            reference_lists[0].iter_by_type(SemanticRawReference))
        assert len(raw_references) == 1
        raw_reference = raw_references[0]
        assert raw_reference.get_text_by_type(SemanticLabel) == ref_label.text
        assert raw_reference.get_text_by_type(
            SemanticRawReferenceText) == ref_text.text
        assert raw_reference.content_id == 'b0'
Beispiel #7
0
    def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """Check that a figure's label, caption and graphic are extracted and cited.

        The whole layout block is given ``segmentation_label`` by the
        segmentation mock. The page also carries one graphic; the figure
        label and caption coordinates are each shifted by ``dy=10`` from the
        previous element. Exactly one ``SemanticFigure`` with content id
        ``fig_0`` is expected across the body and back sections, exactly one
        ``SemanticFigureCitation`` targeting it, and a ``SemanticGraphic``
        inside the figure referring to the page graphic by file base name.
        """
        figure_marker = LayoutBlock.for_text('Figure 1')
        graphic_coordinates = LayoutPageCoordinates(
            x=10, y=10, width=100, height=10)
        graphic_path = '/path/to/graphic1.svg'
        page_graphic = LayoutGraphic(coordinates=graphic_coordinates,
                                     local_file_path=graphic_path)
        label_coordinates = graphic_coordinates.move_by(dy=10)
        figure_label = LayoutBlock.for_text('Figure 1',
                                            coordinates=label_coordinates)
        caption_coordinates = label_coordinates.move_by(dy=10)
        figure_caption = LayoutBlock.for_text('Caption 1',
                                              coordinates=caption_coordinates)
        unlabelled = LayoutBlock.for_text('Other')
        figure_content = LayoutBlock.merge_blocks(
            [figure_label, unlabelled, figure_caption])
        segment = LayoutBlock.merge_blocks([figure_marker, figure_content])
        config = FullTextProcessorConfig(extract_figure_fields=True,
                                         extract_graphic_bounding_boxes=True,
                                         extract_graphic_assets=True)
        processor = FullTextProcessor(fulltext_models_mock, config)

        segmentation = fulltext_models_mock.segmentation_model_mock
        fulltext = fulltext_models_mock.fulltext_model_mock
        figure_model = fulltext_models_mock.figure_model_mock

        # One (model mock, layout block, label) triple per configured prediction.
        for model_mock, block, label in (
                (segmentation, segment, segmentation_label),
                (fulltext, figure_marker, '<figure_marker>'),
                (fulltext, figure_content, '<figure>'),
                (figure_model, figure_label, '<label>'),
                (figure_model, figure_caption, '<figDesc>')):
            model_mock.update_label_by_layout_block(block, label)

        page = LayoutPage(blocks=[segment], graphics=[page_graphic])
        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[page]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        # The figure is searched for in both the body and the back section.
        searched_sections = [document.body_section, document.back_section]
        figures = list(
            iter_by_semantic_type_recursively(searched_sections, SemanticFigure))
        assert len(figures) == 1
        parsed_figure = figures[0]
        assert parsed_figure.get_text_by_type(SemanticLabel) == figure_label.text
        assert parsed_figure.get_text_by_type(
            SemanticCaption) == figure_caption.text
        assert parsed_figure.content_id == 'fig_0'

        figure_citations = list(
            document.iter_by_type_recursively(SemanticFigureCitation))
        assert len(figure_citations) == 1
        figure_citation = figure_citations[0]
        assert figure_citation.get_text() == figure_marker.text
        assert figure_citation.target_content_id == 'fig_0'

        figure_graphics = list(parsed_figure.iter_by_type(SemanticGraphic))
        assert figure_graphics
        first_graphic = figure_graphics[0]
        assert first_graphic.layout_graphic == page_graphic
        assert first_graphic.relative_path == os.path.basename(graphic_path)