def test_should_add_parsed_references(self):
     """Check that a parsed reference in the back section ends up in the TEI listBibl."""
     document = SemanticDocument()
     reference = SemanticReference([
         SemanticTitle(layout_block=LayoutBlock.for_text('Reference Title 1')),
         SemanticRawReferenceText(layout_block=LayoutBlock.for_text('Reference 1'))
     ])
     reference.content_id = 'b0'
     reference_list = SemanticReferenceList([
         SemanticHeading(layout_block=LayoutBlock.for_text('References')),
         reference
     ])
     document.back_section.add_content(reference_list)
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     # every expectation below is relative to the reference list element
     list_bibl_xpath = '//tei:back/tei:div[@type="references"]/tei:listBibl'
     bibl_struct_xpath = f'{list_bibl_xpath}/tei:biblStruct'
     assert tei_document.get_xpath_text_content_list(
         f'{list_bibl_xpath}/tei:head'
     ) == ['References']
     assert tei_document.get_xpath_text_content_list(
         f'{bibl_struct_xpath}/tei:analytic/tei:title[@type="main"]'
     ) == ['Reference Title 1']
     assert tei_document.get_xpath_text_content_list(
         f'{bibl_struct_xpath}/tei:note[@type="raw_reference"]'
     ) == ['Reference 1']
     assert tei_document.get_xpath_text_content_list(
         f'{bibl_struct_xpath}/@xml:id'
     ) == ['b0']
 def test_should_add_section_figures_to_back(self):
     """Check that a figure inside a back section is emitted directly under the annex div."""
     document = SemanticDocument()
     figure = SemanticFigure(
         [
             SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
             SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
         ],
         content_id='fig_0'
     )
     document.back_section.add_content(SemanticSection([figure]))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     annex_xpath = '//tei:back/tei:div[@type="annex"]'
     figure_xpath = f'{annex_xpath}/tei:figure[not(contains(@type, "table"))]'
     expected_by_child_xpath = {
         'tei:head': ['Label 1'],
         'tei:label': ['Label 1'],
         'tei:figDesc': ['Caption 1'],
         '@xml:id': ['fig_0']
     }
     for child_xpath, expected in expected_by_child_xpath.items():
         assert tei_document.get_xpath_text_content_list(
             f'{figure_xpath}/{child_xpath}'
         ) == expected
     # the section held nothing but the figure, so no nested div is expected
     assert not tei_document.xpath(f'{annex_xpath}/tei:div')
 def test_should_add_asset_citation_for_resolved_reference(self):
     """Check that a reference citation with a target id becomes a bibr ref pointing at it."""
     document = SemanticDocument()
     paragraph = SemanticParagraph([
         SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('See')),
         SemanticReferenceCitation(
             layout_block=LayoutBlock.for_text('Ref 1'),
             target_content_id='b0'
         )
     ])
     reference_list = SemanticReferenceList([
         SemanticReference(
             [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
             content_id='b0'
         )
     ])
     document.body_section.add_content(
         SemanticSection([paragraph, reference_list])
     )
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     paragraph_xpath = '//tei:body/tei:div/tei:p'
     citation_xpath = f'{paragraph_xpath}/tei:ref[@type="bibr"]'
     assert tei_document.get_xpath_text_content_list(
         paragraph_xpath
     ) == ['See Ref 1']
     assert tei_document.get_xpath_text_content_list(
         citation_xpath
     ) == ['Ref 1']
     assert tei_document.get_xpath_text_content_list(
         f'{citation_xpath}/@target'
     ) == ['#b0']
 def test_should_add_single_author(self):
     """Check that each name part of a single author maps to its TEI name element."""
     document = SemanticDocument()
     name_parts = [
         SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
         SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
         SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
         SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
         SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1'))
     ]
     document.front.add_content(SemanticAuthor(name_parts))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     expected_text_by_xpath = {
         '//tei:author//tei:roleName': 'Title1',
         '//tei:author//tei:forename[@type="first"]': 'Given1',
         '//tei:author//tei:forename[@type="middle"]': 'Middle1',
         '//tei:author//tei:surname': 'Surname1',
         '//tei:author//tei:genName': 'Suffix1'
     }
     for xpath, expected_text in expected_text_by_xpath.items():
         assert tei_document.get_xpath_text_content_list(xpath) == [expected_text]
Example #5
0
 def _process_graphics(
     self,
     document: SemanticDocument,
     layout_document: LayoutDocument,
     context: FullTextProcessorDocumentContext
 ):
     """Match the layout document's graphics against the figures of `document`.

     All `SemanticFigure` instances found in `document` are the match
     candidates. A `SemanticMixedNote` of type ``unmatched_graphics`` is handed
     to `_match_graphic_elements` as the container for unmatched graphics; it
     is appended to the back section only when it is not empty afterwards.
     """
     unmatched_note = SemanticMixedNote(note_type='unmatched_graphics')
     figure_candidates = list(
         document.iter_by_type_recursively(SemanticFigure)
     )
     # the provider is selected using the page numbers of the candidate figures
     graphic_provider = self._get_document_graphic_provider(
         context=context,
         page_numbers=get_page_numbers_for_semantic_content_list(
             figure_candidates
         )
     )
     semantic_graphics = list(
         graphic_provider.iter_semantic_graphic_for_layout_document(
             layout_document,
             extract_graphic_assets=self.config.extract_graphic_assets
         )
     )
     self._match_graphic_elements(
         semantic_graphic_list=semantic_graphics,
         candidate_semantic_content_list=figure_candidates,
         unmatched_graphics_container=unmatched_note
     )
     if unmatched_note.is_empty():
         LOGGER.debug('no unmatched graphics')
         return
     LOGGER.debug('unmatched_graphics_container: %r', unmatched_note)
     document.back_section.add_content(unmatched_note)
 def test_should_set_abstract(self):
     """Check that an abstract added to the front is rendered as an abstract paragraph."""
     document = SemanticDocument()
     abstract = SemanticAbstract(LayoutBlock.for_text(TOKEN_1))
     document.front.add_content(abstract)
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     abstract_texts = tei_document.get_xpath_text_content_list(
         '//tei:abstract/tei:p'
     )
     assert abstract_texts == [TOKEN_1]
 def test_should_add_notes_to_body(self):
     """Check that a note added to the body section is rendered as a typed body note."""
     document = SemanticDocument()
     note_block = LayoutBlock.for_text(TOKEN_1)
     document.body_section.add_note(note_block, 'other')
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     note_texts = tei_document.get_xpath_text_content_list(
         '//tei:body/tei:note[@type="other"]'
     )
     assert note_texts == [TOKEN_1]
 def test_should_set_manuscript_title(self):
     """Check that a title added to the front becomes the main title of the titleStmt."""
     document = SemanticDocument()
     title = SemanticTitle(layout_block=LayoutBlock.for_text(TOKEN_1))
     document.front.add_content(title)
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     title_texts = tei_document.get_xpath_text_content_list(
         '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]'
     )
     assert title_texts == [TOKEN_1]
 def test_should_create_back_section(self):
     """Check that a back section with heading and paragraph is nested under the annex div."""
     document = SemanticDocument()
     back_section = document.back_section.add_new_section()
     back_section.add_heading_block(LayoutBlock.for_text(TOKEN_1))
     back_section.add_new_paragraph().add_block_content(
         LayoutBlock.for_text(TOKEN_2)
     )
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     section_xpath = '//tei:back/tei:div[@type="annex"]/tei:div'
     assert tei_document.get_xpath_text_content_list(
         f'{section_xpath}/tei:head'
     ) == [TOKEN_1]
     assert tei_document.get_xpath_text_content_list(
         f'{section_xpath}/tei:p'
     ) == [TOKEN_2]
Example #10
0
 def test_should_not_convert_pdf_to_jats_zip(  # pylint: disable=too-many-locals
     self,
     sciencebeam_parser_session: ScienceBeamParserSession,
     get_tei_for_semantic_document_mock: MagicMock,
     full_text_processor_class_mock: MagicMock,
     full_text_processor_mock: MagicMock,
     xslt_transformer_wrapper_mock: MagicMock,
     request_temp_path: Path
 ):
     """Request a JATS zip for a PDF source and check the archive and processor config.

     The zip is expected to hold ``jats.xml`` plus the graphic asset, and the
     full text processor must have been configured with
     ``extract_graphic_assets`` enabled.
     """
     # NOTE(review): the test name says "not convert", yet the assertions below
     # verify a successful conversion - confirm whether the name is intended.
     pdf_path = request_temp_path / 'test.pdf'
     alto_xml_path = request_temp_path / TEMP_ALTO_XML_FILENAME
     image_path = request_temp_path / 'image1.png'
     image_name_in_zip = image_path.name
     alto_xml_path.write_bytes(XML_CONTENT_1)
     image_path.write_bytes(IMAGE_DATA_1)
     get_tei_for_semantic_document_mock.return_value = TeiDocument(
         etree.fromstring(TEI_XML_CONTENT_1)
     )
     xslt_transformer_wrapper_mock.return_value = etree.fromstring(
         JATS_XML_CONTENT_1
     )
     document = SemanticDocument()
     graphic = SemanticGraphic(
         layout_graphic=LayoutGraphic(local_file_path=str(image_path)),
         relative_path=image_name_in_zip
     )
     document.back_section.add_content(graphic)
     full_text_processor_mock.get_semantic_document_for_layout_document.return_value = (
         document
     )
     source = sciencebeam_parser_session.get_source(
         str(pdf_path),
         MediaTypes.PDF
     )
     zip_path = source.get_local_file_for_response_media_type(
         MediaTypes.JATS_ZIP
     )
     with ZipFile(zip_path, 'r') as zip_file:
         assert zip_file.read('jats.xml') == JATS_XML_CONTENT_1
         assert zip_file.read(image_name_in_zip) == IMAGE_DATA_1
     # call_args[1] holds the keyword arguments of the processor construction
     processor_config = full_text_processor_class_mock.call_args[1]['config']
     assert processor_config.extract_graphic_assets is True
 def test_should_unmatched_graphics_to_back(self):
     """Check that an unmatched graphics note in the back is rendered with its graphic url."""
     document = SemanticDocument()
     graphic = SemanticGraphic(
         layout_graphic=LayoutGraphic(coordinates=COORDINATES_1),
         relative_path='image1.svg'
     )
     document.back_section.add_content(
         SemanticMixedNote([graphic], note_type='unmatched_graphics')
     )
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     graphic_xpath = '//tei:note[@type="unmatched_graphics"]//tei:graphic'
     assert tei_document.xpath_nodes(graphic_xpath)
     graphic_urls = tei_document.get_xpath_text_content_list(
         f'{graphic_xpath}/@url'
     )
     assert graphic_urls == ['image1.svg']
 def test_should_add_orphan_affiliation(self):
     """Check that an affiliation without any author is still rendered inside an author."""
     document = SemanticDocument()
     marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
     institution = SemanticInstitution(
         layout_block=LayoutBlock.for_text('Institution1')
     )
     affiliation = SemanticAffiliationAddress(
         [marker, institution],
         content_id='aff0'
     )
     document.front.add_content(affiliation)
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     affiliation_xpath = '//tei:author/tei:affiliation'
     raw_affiliation_xpath = f'{affiliation_xpath}/tei:note[@type="raw_affiliation"]'
     assert tei_document.get_xpath_text_content_list(
         raw_affiliation_xpath
     ) == [affiliation.get_text()]
     assert tei_document.get_xpath_text_content_list(
         f'{raw_affiliation_xpath}/tei:label'
     ) == [marker.get_text()]
     assert tei_document.get_xpath_text_content_list(
         f'{affiliation_xpath}/@key'
     ) == ['aff0']
 def test_should_add_single_author_with_affiliation(self):
     """Check that an author and an affiliation sharing a marker are rendered together."""
     document = SemanticDocument()
     name_parts = [
         SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
         SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
         SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
         SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
         SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1'))
     ]
     author_marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
     author = SemanticAuthor(name_parts + [author_marker])
     # the affiliation carries the same marker text ('1') as the author
     aff_marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
     institution = SemanticInstitution(
         layout_block=LayoutBlock.for_text('Institution1')
     )
     affiliation = SemanticAffiliationAddress(
         [aff_marker, institution],
         content_id='aff0'
     )
     document.front.add_content(author)
     document.front.add_content(affiliation)
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     affiliation_xpath = '//tei:author/tei:affiliation'
     raw_affiliation_xpath = f'{affiliation_xpath}/tei:note[@type="raw_affiliation"]'
     expected_by_xpath = {
         '//tei:author//tei:roleName': ['Title1'],
         '//tei:author//tei:forename[@type="first"]': ['Given1'],
         '//tei:author//tei:forename[@type="middle"]': ['Middle1'],
         '//tei:author//tei:surname': ['Surname1'],
         '//tei:author//tei:genName': ['Suffix1'],
         raw_affiliation_xpath: [affiliation.get_text()],
         f'{raw_affiliation_xpath}/tei:label': [aff_marker.get_text()],
         f'{affiliation_xpath}/@key': ['aff0'],
         f'{affiliation_xpath}/tei:orgName[@type="institution"]': [
             institution.get_text()
         ]
     }
     for xpath, expected in expected_by_xpath.items():
         assert tei_document.get_xpath_text_content_list(xpath) == expected
Example #14
0
def create_asset_zip_for_semantic_document(zip_filename: str,
                                           semantic_document: SemanticDocument,
                                           relative_xml_filename: str,
                                           local_xml_filename: str):
    """Write the XML file and all graphic assets of a document into a zip file.

    The XML file at `local_xml_filename` is stored as `relative_xml_filename`.
    Every `SemanticGraphic` found recursively in `semantic_document` is stored
    under its `relative_path`, read from its layout graphic's `local_file_path`.

    Args:
        zip_filename: path of the zip file to create (opened in write mode).
        semantic_document: document whose graphics are added to the zip.
        relative_xml_filename: name of the XML entry inside the zip.
        local_xml_filename: path of the XML file on disk.

    Raises:
        AssertionError: if a graphic has no `relative_path`, no
            `layout_graphic`, or its layout graphic has no `local_file_path`.
    """
    semantic_graphic_list = list(
        semantic_document.iter_by_type_recursively(SemanticGraphic))
    LOGGER.debug('semantic_graphic_list: %r', semantic_graphic_list)
    with ZipFile(zip_filename, 'w') as zip_file:
        zip_file.write(local_xml_filename, relative_xml_filename)
        for semantic_graphic in semantic_graphic_list:
            # Explicit checks rather than `assert` statements: asserts are
            # removed under `python -O`, and a missing relative path would
            # then be passed to ZipFile.write as `arcname=None` unnoticed.
            # AssertionError is kept so the raised exception type is unchanged.
            relative_path = semantic_graphic.relative_path
            if not relative_path:
                raise AssertionError(
                    "graphic relative_path missing, ensure extract_graphic_assets was enabled"
                )
            layout_graphic = semantic_graphic.layout_graphic
            if not layout_graphic:
                raise AssertionError('graphic layout_graphic missing')
            local_file_path = layout_graphic.local_file_path
            if not local_file_path:
                raise AssertionError('graphic local_file_path missing')
            zip_file.write(local_file_path, relative_path)
    LOGGER.debug('response_content (bytes): %d',
                 Path(zip_filename).stat().st_size)
 def test_should_use_author_name_part_values(self):
     """Check that a name part's `value` is used in the TEI instead of its layout text."""
     document = SemanticDocument()
     given_name = SemanticGivenName(layout_block=LayoutBlock.for_text('GIVEN1'))
     given_name.value = 'Given1'
     middle_name = SemanticMiddleName(layout_block=LayoutBlock.for_text('MIDDLE1'))
     middle_name.value = 'Middle1'
     surname = SemanticSurname(layout_block=LayoutBlock.for_text('SURNAME1'))
     surname.value = 'Surname1'
     document.front.add_content(
         SemanticAuthor([given_name, middle_name, surname])
     )
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     # the expected texts are the assigned values, not the upper-case layout text
     expected_text_by_xpath = {
         '//tei:author//tei:forename[@type="first"]': 'Given1',
         '//tei:author//tei:forename[@type="middle"]': 'Middle1',
         '//tei:author//tei:surname': 'Surname1'
     }
     for xpath, expected_text in expected_text_by_xpath.items():
         assert tei_document.get_xpath_text_content_list(xpath) == [expected_text]
 def test_should_add_raw_equation_with_label_to_paragraph(self):
     """Check that a raw equation in a paragraph is emitted as a formula next to the paragraph."""
     # to be consistent with Java GROBID
     document = SemanticDocument()
     equation = SemanticRawEquation([
         SemanticRawEquationContent(layout_block=LayoutBlock.for_text('Equation 1')),
         SemanticLabel(layout_block=LayoutBlock.for_text('(1)'))
     ])
     paragraph = SemanticParagraph([
         SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('Next')),
         equation
     ])
     document.body_section.add_content(SemanticSection([paragraph]))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     section_xpath = '//tei:body/tei:div'
     formula_xpath = f'{section_xpath}/tei:formula'
     assert tei_document.get_xpath_text_content_list(
         formula_xpath
     ) == ['Equation 1 (1)']
     assert tei_document.get_xpath_text_content_list(
         f'{formula_xpath}/tei:label'
     ) == ['(1)']
     assert tei_document.get_xpath_text_content_list(
         f'{section_xpath}/tei:p'
     ) == ['Next']
 def test_should_add_section_tables_to_body(self):
     """Check that a table inside a body section is emitted directly under the body."""
     document = SemanticDocument()
     table = SemanticTable(
         [
             SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
             SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
         ],
         content_id='tab_0'
     )
     document.body_section.add_content(SemanticSection([table]))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     table_xpath = '//tei:body/tei:figure[@type="table"]'
     expected_by_child_xpath = {
         'tei:head': ['Label 1'],
         'tei:label': ['Label 1'],
         'tei:figDesc': ['Caption 1'],
         '@xml:id': ['tab_0']
     }
     for child_xpath, expected in expected_by_child_xpath.items():
         assert tei_document.get_xpath_text_content_list(
             f'{table_xpath}/{child_xpath}'
         ) == expected
     # the section held nothing but the table, so no body div is expected
     assert not tei_document.xpath('//tei:body/tei:div')
Example #18
0
    def get_semantic_document_for_layout_document(
        self,
        layout_document: LayoutDocument,
        context: Optional[FullTextProcessorDocumentContext] = None
    ) -> SemanticDocument:
        """Build a `SemanticDocument` from a layout document.

        The layout document is first passed through `_preprocess_layout_graphics`
        and then labelled by the segmentation model. Each following extraction
        step is optional and only runs when the corresponding `self.config`
        flag is set.

        Args:
            layout_document: the layout document to process.
            context: optional per-document context; a new
                `FullTextProcessorDocumentContext` is created when omitted.

        Returns:
            The populated `SemanticDocument`.
        """
        if context is None:
            context = FullTextProcessorDocumentContext()
        # from here on `layout_document` refers to the preprocessed document
        layout_document = self._preprocess_layout_graphics(
            layout_document,
            context=context
        )
        segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
            layout_document,
            app_features_context=self.app_features_context
        )
        # computed unconditionally, but only used when `extract_front` is enabled
        header_layout_document = segmentation_label_result.get_filtered_document_by_label(
            '<header>'
        ).remove_empty_blocks()
        document = SemanticDocument()
        if self.config.extract_front:
            self._process_header_layout_document(
                header_layout_document=header_layout_document,
                semantic_document=document
            )

        if self.config.extract_body_sections:
            self._update_semantic_section_using_segmentation_result_and_fulltext_model(
                document.body_section,
                segmentation_label_result,
                '<body>',
                SemanticSectionTypes.OTHER
            )
        # acknowledgements and annex content are both added to the back section
        if self.config.extract_acknowledgements:
            self._update_semantic_section_using_segmentation_result_and_fulltext_model(
                document.back_section,
                segmentation_label_result,
                '<acknowledgement>',
                SemanticSectionTypes.ACKNOWLEDGEMENT
            )
        if self.config.extract_back_sections:
            self._update_semantic_section_using_segmentation_result_and_fulltext_model(
                document.back_section,
                segmentation_label_result,
                '<annex>',
                SemanticSectionTypes.OTHER
            )
        if self.config.extract_references:
            self._extract_raw_references_from_segmentation(
                semantic_document=document,
                segmentation_label_result=segmentation_label_result
            )
        if self.config.extract_citation_fields:
            self._extract_reference_fields_from_raw_references(
                semantic_document=document
            )
            # name lists are only extracted when citation fields are enabled as well
            if self.config.extract_citation_authors or self.config.extract_citation_editors:
                self._extract_reference_name_lists_from_raw_references(
                    semantic_document=document
                )
            references = list(document.iter_by_type_recursively(SemanticReference))
            ref_citations = list(document.iter_by_type_recursively(SemanticReferenceCitation))
            # reference ids come from iter_ids('b')
            self._assign_content_ids(references, iter(iter_ids('b')))
            # citations are matched using the reference labels and the raw
            # reference text, via the two matchers chained below
            self._assign_target_content_ids(ref_citations, ChainedContentIdMatcher([
                SimpleContentIdMatcher(
                    self._get_semantic_content_text_by_content_id(references, SemanticLabel)
                ),
                PartialContentIdMatcher(
                    self._get_semantic_content_text_by_content_id(
                        references, SemanticRawReferenceText
                    )
                )
            ]))
        if self.config.extract_figure_fields:
            self._extract_figure_fields_from_raw_figures(semantic_document=document)
            figures = list(document.iter_by_type_recursively(SemanticFigure))
            figure_citations = list(document.iter_by_type_recursively(SemanticFigureCitation))
            # figure ids come from iter_ids('fig_'); citations are matched by figure label
            self._assign_content_ids(figures, iter(iter_ids('fig_')))
            self._assign_target_content_ids(figure_citations, SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(figures, SemanticLabel)
            ))
        if self.config.extract_table_fields:
            self._extract_table_fields_from_raw_tables(semantic_document=document)
            tables = list(document.iter_by_type_recursively(SemanticTable))
            table_citations = list(document.iter_by_type_recursively(SemanticTableCitation))
            # table ids come from iter_ids('tab_'); citations are matched by table label
            self._assign_content_ids(tables, iter(iter_ids('tab_')))
            self._assign_target_content_ids(table_citations, SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
            ))
        # runs last, after the figure extraction step above has had its chance to run
        if self.config.extract_graphic_bounding_boxes:
            self._process_graphics(
                document=document,
                layout_document=layout_document,
                context=context
            )
        return document
 def test_should_return_empty_document(self):
     """Check that an empty semantic document produces a TEI document without any div."""
     tei_document = get_tei_for_semantic_document(SemanticDocument())
     assert not tei_document.xpath('//tei:div')