def test_should_add_parsed_references(self):
    # A parsed reference placed in a reference list of the back section
    # should show up as a `biblStruct` below `listBibl`, carrying its title,
    # raw reference note and xml:id; the list heading becomes the `head`.
    document = SemanticDocument()
    reference = SemanticReference([
        SemanticTitle(
            layout_block=LayoutBlock.for_text('Reference Title 1')
        ),
        SemanticRawReferenceText(
            layout_block=LayoutBlock.for_text('Reference 1')
        ),
    ])
    reference.content_id = 'b0'
    heading = SemanticHeading(
        layout_block=LayoutBlock.for_text('References')
    )
    document.back_section.add_content(
        SemanticReferenceList([heading, reference])
    )

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    list_bibl_xpath = '//tei:back/tei:div[@type="references"]/tei:listBibl'
    bibl_struct_xpath = f'{list_bibl_xpath}/tei:biblStruct'
    assert tei_doc.get_xpath_text_content_list(
        f'{list_bibl_xpath}/tei:head'
    ) == ['References']
    assert tei_doc.get_xpath_text_content_list(
        f'{bibl_struct_xpath}/tei:analytic/tei:title[@type="main"]'
    ) == ['Reference Title 1']
    assert tei_doc.get_xpath_text_content_list(
        f'{bibl_struct_xpath}/tei:note[@type="raw_reference"]'
    ) == ['Reference 1']
    assert tei_doc.get_xpath_text_content_list(
        f'{bibl_struct_xpath}/@xml:id'
    ) == ['b0']
def test_should_add_section_figures_to_back(self):
    # A figure wrapped in a back section should be emitted as a non-table
    # `figure` directly under the annex `div`, without a nested section `div`.
    document = SemanticDocument()
    figure = SemanticFigure(
        [
            SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
            SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
        ],
        content_id='fig_0'
    )
    document.back_section.add_content(SemanticSection([figure]))

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    annex_xpath = '//tei:back/tei:div[@type="annex"]'
    figure_xpath = (
        f'{annex_xpath}/tei:figure[not(contains(@type, "table"))]'
    )
    expected_by_relative_xpath = [
        ('tei:head', ['Label 1']),
        ('tei:label', ['Label 1']),
        ('tei:figDesc', ['Caption 1']),
        ('@xml:id', ['fig_0']),
    ]
    for relative_xpath, expected in expected_by_relative_xpath:
        assert tei_doc.get_xpath_text_content_list(
            f'{figure_xpath}/{relative_xpath}'
        ) == expected
    assert not tei_doc.xpath(f'{annex_xpath}/tei:div')
def test_should_add_single_author(self):
    # Each name part of a front-matter author should map to its own TEI
    # element below `author`.
    document = SemanticDocument()
    name_parts = [
        SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
        SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
        SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
        SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
        SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1')),
    ]
    document.front.add_content(SemanticAuthor(name_parts))

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    expected_by_xpath = [
        ('//tei:author//tei:roleName', ['Title1']),
        ('//tei:author//tei:forename[@type="first"]', ['Given1']),
        ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
        ('//tei:author//tei:surname', ['Surname1']),
        ('//tei:author//tei:genName', ['Suffix1']),
    ]
    for xpath, expected in expected_by_xpath:
        assert tei_doc.get_xpath_text_content_list(xpath) == expected
def test_should_add_asset_citation_for_resolved_reference(self):
    # A reference citation whose target id matches a reference in the same
    # section should become a `ref[@type="bibr"]` pointing at `#<id>`.
    document = SemanticDocument()
    paragraph = SemanticParagraph([
        SemanticTextContentWrapper(
            layout_block=LayoutBlock.for_text('See')
        ),
        SemanticReferenceCitation(
            layout_block=LayoutBlock.for_text('Ref 1'),
            target_content_id='b0'
        ),
    ])
    reference_list = SemanticReferenceList([
        SemanticReference(
            [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
            content_id='b0'
        )
    ])
    document.body_section.add_content(
        SemanticSection([paragraph, reference_list])
    )

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    paragraph_xpath = '//tei:body/tei:div/tei:p'
    citation_xpath = f'{paragraph_xpath}/tei:ref[@type="bibr"]'
    assert tei_doc.get_xpath_text_content_list(
        paragraph_xpath
    ) == ['See Ref 1']
    assert tei_doc.get_xpath_text_content_list(
        citation_xpath
    ) == ['Ref 1']
    assert tei_doc.get_xpath_text_content_list(
        f'{citation_xpath}/@target'
    ) == ['#b0']
def test_should_set_abstract(self):
    # An abstract added to the front matter should be rendered as a
    # paragraph inside `abstract`.
    document = SemanticDocument()
    abstract = SemanticAbstract(LayoutBlock.for_text(TOKEN_1))
    document.front.add_content(abstract)

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    abstract_texts = tei_doc.get_xpath_text_content_list(
        '//tei:abstract/tei:p'
    )
    assert abstract_texts == [TOKEN_1]
def test_should_add_notes_to_body(self):
    # A note added to the body section should be rendered as a `note`
    # with the given type directly under `body`.
    document = SemanticDocument()
    note_block = LayoutBlock.for_text(TOKEN_1)
    document.body_section.add_note(note_block, 'other')

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    note_texts = tei_doc.get_xpath_text_content_list(
        '//tei:body/tei:note[@type="other"]'
    )
    assert note_texts == [TOKEN_1]
def test_should_set_manuscript_title(self):
    # A title in the front matter should become the main, level "a" title
    # of the `titleStmt`.
    document = SemanticDocument()
    title = SemanticTitle(layout_block=LayoutBlock.for_text(TOKEN_1))
    document.front.add_content(title)

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    title_texts = tei_doc.get_xpath_text_content_list(
        '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]'
    )
    assert title_texts == [TOKEN_1]
def get_tei_document_for_layout_document(
    self,
    layout_document: LayoutDocument
) -> TeiDocument:
    """Return the TEI document for the given layout document.

    The layout document is first turned into a semantic document via
    `get_semantic_document_for_layout_document`, which is then passed to
    `get_tei_for_semantic_document`.
    """
    semantic_document = self.get_semantic_document_for_layout_document(
        layout_document
    )
    return get_tei_for_semantic_document(semantic_document)
def test_should_create_back_section(self):
    # A back section with a heading and a paragraph should be rendered as a
    # `div` (with `head` and `p`) nested inside the annex `div`.
    document = SemanticDocument()
    back_section = document.back_section.add_new_section()
    back_section.add_heading_block(LayoutBlock.for_text(TOKEN_1))
    back_section.add_new_paragraph().add_block_content(
        LayoutBlock.for_text(TOKEN_2)
    )

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    section_xpath = '//tei:back/tei:div[@type="annex"]/tei:div'
    assert tei_doc.get_xpath_text_content_list(
        f'{section_xpath}/tei:head'
    ) == [TOKEN_1]
    assert tei_doc.get_xpath_text_content_list(
        f'{section_xpath}/tei:p'
    ) == [TOKEN_2]
def test_should_unmatched_graphics_to_back(self):
    # A graphic inside an "unmatched_graphics" mixed note in the back
    # section should be rendered as a `graphic` with its relative path as
    # the `url` attribute.
    document = SemanticDocument()
    graphic = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=COORDINATES_1),
        relative_path='image1.svg'
    )
    document.back_section.add_content(
        SemanticMixedNote([graphic], note_type='unmatched_graphics')
    )

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    graphics_xpath = '//tei:note[@type="unmatched_graphics"]//tei:graphic'
    assert tei_doc.xpath_nodes(graphics_xpath)
    graphic_urls = tei_doc.get_xpath_text_content_list(
        f'{graphics_xpath}/@url'
    )
    assert graphic_urls == ['image1.svg']
def test_should_add_orphan_affiliation(self):
    # An affiliation added to the front matter without any author should
    # still be rendered as an `affiliation` below an `author` element.
    document = SemanticDocument()
    aff_marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
    institution = SemanticInstitution(
        layout_block=LayoutBlock.for_text('Institution1')
    )
    aff = SemanticAffiliationAddress(
        [aff_marker, institution],
        content_id='aff0'
    )
    document.front.add_content(aff)

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    affiliation_xpath = '//tei:author/tei:affiliation'
    raw_note_xpath = (
        f'{affiliation_xpath}/tei:note[@type="raw_affiliation"]'
    )
    assert tei_doc.get_xpath_text_content_list(
        raw_note_xpath
    ) == [aff.get_text()]
    assert tei_doc.get_xpath_text_content_list(
        f'{raw_note_xpath}/tei:label'
    ) == [aff_marker.get_text()]
    assert tei_doc.get_xpath_text_content_list(
        f'{affiliation_xpath}/@key'
    ) == ['aff0']
def test_should_add_single_author_with_affiliation(self):
    # An author and an affiliation sharing the marker text "1" should yield
    # an `author` with all name parts and a nested `affiliation`.
    document = SemanticDocument()
    author_parts = [
        SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
        SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
        SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
        SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
        SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1')),
        SemanticMarker(layout_block=LayoutBlock.for_text('1')),
    ]
    author = SemanticAuthor(author_parts)
    aff_marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
    institution = SemanticInstitution(
        layout_block=LayoutBlock.for_text('Institution1')
    )
    aff = SemanticAffiliationAddress(
        [aff_marker, institution],
        content_id='aff0'
    )
    document.front.add_content(author)
    document.front.add_content(aff)

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    # name parts
    expected_name_parts_by_xpath = [
        ('//tei:author//tei:roleName', ['Title1']),
        ('//tei:author//tei:forename[@type="first"]', ['Given1']),
        ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
        ('//tei:author//tei:surname', ['Surname1']),
        ('//tei:author//tei:genName', ['Suffix1']),
    ]
    for xpath, expected in expected_name_parts_by_xpath:
        assert tei_doc.get_xpath_text_content_list(xpath) == expected

    # affiliation
    affiliation_xpath = '//tei:author/tei:affiliation'
    raw_note_xpath = (
        f'{affiliation_xpath}/tei:note[@type="raw_affiliation"]'
    )
    assert tei_doc.get_xpath_text_content_list(
        raw_note_xpath
    ) == [aff.get_text()]
    assert tei_doc.get_xpath_text_content_list(
        f'{raw_note_xpath}/tei:label'
    ) == [aff_marker.get_text()]
    assert tei_doc.get_xpath_text_content_list(
        f'{affiliation_xpath}/@key'
    ) == ['aff0']
    assert tei_doc.get_xpath_text_content_list(
        f'{affiliation_xpath}/tei:orgName[@type="institution"]'
    ) == [institution.get_text()]
def test_should_use_author_name_part_values(self):
    # When a name part has an explicit `value`, the TEI output should use
    # that value rather than the (upper-case) layout text.
    document = SemanticDocument()

    given_name = SemanticGivenName(
        layout_block=LayoutBlock.for_text('GIVEN1')
    )
    given_name.value = 'Given1'

    middle_name = SemanticMiddleName(
        layout_block=LayoutBlock.for_text('MIDDLE1')
    )
    middle_name.value = 'Middle1'

    surname = SemanticSurname(
        layout_block=LayoutBlock.for_text('SURNAME1')
    )
    surname.value = 'Surname1'

    document.front.add_content(
        SemanticAuthor([given_name, middle_name, surname])
    )

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    expected_by_xpath = [
        ('//tei:author//tei:forename[@type="first"]', ['Given1']),
        ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
        ('//tei:author//tei:surname', ['Surname1']),
    ]
    for xpath, expected in expected_by_xpath:
        assert tei_doc.get_xpath_text_content_list(xpath) == expected
def get_local_file_for_response_media_type(
    self,
    response_media_type: str
) -> str:
    """Serialize the semantic document and return the local result path.

    The TEI XML is generated from `self.semantic_document`. For a media type
    in `JATS_MEDIA_TYPES` it is converted via `_get_tei_to_jats_xml_root`
    and written as `jats.xml`, otherwise it is written as `tei.xml`; either
    file is placed in `self.temp_dir`.

    Returns:
        The path of `results.zip` in `self.temp_dir` when the media type is
        in `ASSET_ZIP_MEDIA_TYPES`, otherwise the path of the XML file.

    Raises:
        UnsupportedResponseMediaTypeScienceBeamParserError: if the media
            type is not among `get_supported_response_media_type()`.
    """
    supported_media_types = self.get_supported_response_media_type()
    if response_media_type not in supported_media_types:
        raise UnsupportedResponseMediaTypeScienceBeamParserError()

    xml_root = get_tei_for_semantic_document(self.semantic_document).root
    is_jats_requested = response_media_type in JATS_MEDIA_TYPES
    if is_jats_requested:
        xml_root = self._get_tei_to_jats_xml_root(xml_root)
    relative_xml_filename = 'jats.xml' if is_jats_requested else 'tei.xml'

    local_xml_filename = os.path.join(self.temp_dir, relative_xml_filename)
    self._serialize_xml_to_file(xml_root, local_xml_filename)
    LOGGER.debug('local_xml_filename: %r', local_xml_filename)

    if response_media_type not in ASSET_ZIP_MEDIA_TYPES:
        return local_xml_filename

    # The zip is created from the semantic document together with the XML
    # file that was just written.
    zip_filename = os.path.join(self.temp_dir, 'results.zip')
    create_asset_zip_for_semantic_document(
        zip_filename,
        semantic_document=self.semantic_document,
        local_xml_filename=local_xml_filename,
        relative_xml_filename=relative_xml_filename
    )
    return zip_filename
def test_should_add_raw_equation_with_label_to_paragraph(self):
    # to be consistent with Java GROBID
    # The equation inside the paragraph is expected as a `formula` sibling
    # of `p` (both below the section `div`), with the label nested in it.
    document = SemanticDocument()
    equation = SemanticRawEquation([
        SemanticRawEquationContent(
            layout_block=LayoutBlock.for_text('Equation 1')
        ),
        SemanticLabel(layout_block=LayoutBlock.for_text('(1)')),
    ])
    paragraph = SemanticParagraph([
        SemanticTextContentWrapper(
            layout_block=LayoutBlock.for_text('Next')
        ),
        equation,
    ])
    document.body_section.add_content(SemanticSection([paragraph]))

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    section_xpath = '//tei:body/tei:div'
    formula_xpath = f'{section_xpath}/tei:formula'
    assert tei_doc.get_xpath_text_content_list(
        formula_xpath
    ) == ['Equation 1 (1)']
    assert tei_doc.get_xpath_text_content_list(
        f'{formula_xpath}/tei:label'
    ) == ['(1)']
    assert tei_doc.get_xpath_text_content_list(
        f'{section_xpath}/tei:p'
    ) == ['Next']
def test_should_add_section_tables_to_body(self):
    # A table wrapped in a body section should be emitted as a
    # `figure[@type="table"]` directly under `body`, without a section `div`.
    document = SemanticDocument()
    table = SemanticTable(
        [
            SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
            SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
        ],
        content_id='tab_0'
    )
    document.body_section.add_content(SemanticSection([table]))

    tei_doc = get_tei_for_semantic_document(document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_doc.root))

    body_xpath = '//tei:body'
    table_xpath = f'{body_xpath}/tei:figure[@type="table"]'
    expected_by_relative_xpath = [
        ('tei:head', ['Label 1']),
        ('tei:label', ['Label 1']),
        ('tei:figDesc', ['Caption 1']),
        ('@xml:id', ['tab_0']),
    ]
    for relative_xpath, expected in expected_by_relative_xpath:
        assert tei_doc.get_xpath_text_content_list(
            f'{table_xpath}/{relative_xpath}'
        ) == expected
    assert not tei_doc.xpath(f'{body_xpath}/tei:div')
def test_should_return_empty_document(self):
    # An empty semantic document should not produce any `div` elements.
    tei_doc = get_tei_for_semantic_document(SemanticDocument())
    div_nodes = tei_doc.xpath('//tei:div')
    assert not div_nodes