def test_should_add_single_author(self):
    """Check that each name part of a single author is rendered to TEI.

    A `SemanticAuthor` made of title, given name, middle name, surname and
    suffix is added to the document front; the test then looks up the
    corresponding elements below `tei:author`.
    """
    semantic_document = SemanticDocument()
    name_parts = [
        SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
        SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
        SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
        SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
        SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1')),
    ]
    semantic_document.front.add_content(SemanticAuthor(name_parts))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    # Checked in order: one XPath per name part.
    expected_texts_by_xpath = [
        ('//tei:author//tei:roleName', ['Title1']),
        ('//tei:author//tei:forename[@type="first"]', ['Given1']),
        ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
        ('//tei:author//tei:surname', ['Surname1']),
        ('//tei:author//tei:genName', ['Suffix1']),
    ]
    for xpath, expected_texts in expected_texts_by_xpath:
        assert tei_document.get_xpath_text_content_list(xpath) == expected_texts
def test_should_extract_single_affiliation_address(self):
    """Check that one affiliation address is built from all labelled blocks.

    Every supported label (marker, institution, department, laboratory,
    address line, post code, post box, region, settlement, country) is fed to
    the extractor once; a single `SemanticAffiliationAddress` holding each
    text is expected.
    """
    extractor = AffiliationAddressSemanticExtractor()
    entity_blocks = [
        ('<marker>', LayoutBlock.for_text('1')),
        ('<institution>', LayoutBlock.for_text('Institution 1')),
        ('<department>', LayoutBlock.for_text('Department 1')),
        ('<laboratory>', LayoutBlock.for_text('Laboratory 1')),
        ('<addrLine>', LayoutBlock.for_text('Address Line 1')),
        ('<postCode>', LayoutBlock.for_text('Post Code 1')),
        ('<postBox>', LayoutBlock.for_text('Post Box 1')),
        ('<region>', LayoutBlock.for_text('Region 1')),
        ('<settlement>', LayoutBlock.for_text('Settlement 1')),
        ('<country>', LayoutBlock.for_text('Country 1')),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 1
    aff_address = semantic_content_list[0]
    assert isinstance(aff_address, SemanticAffiliationAddress)

    # Checked in order: the text expected for each semantic type.
    expected_text_by_type = [
        (SemanticMarker, '1'),
        (SemanticInstitution, 'Institution 1'),
        (SemanticDepartment, 'Department 1'),
        (SemanticLaboratory, 'Laboratory 1'),
        (SemanticAddressLine, 'Address Line 1'),
        (SemanticPostCode, 'Post Code 1'),
        (SemanticPostBox, 'Post Box 1'),
        (SemanticRegion, 'Region 1'),
        (SemanticSettlement, 'Settlement 1'),
        (SemanticCountry, 'Country 1'),
    ]
    for semantic_type, expected_text in expected_text_by_type:
        assert aff_address.view_by_type(semantic_type).get_text() == expected_text
def test_should_add_asset_citation_for_resolved_reference(self):
    """Check that a reference citation is rendered as a `tei:ref` with target.

    The body section contains a paragraph ("See" plus a citation targeting
    'b0') and a reference list whose reference has content id 'b0'. The
    paragraph text, the `bibr` ref text and its `@target` are verified.
    """
    semantic_document = SemanticDocument()
    paragraph = SemanticParagraph([
        SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('See')),
        SemanticReferenceCitation(
            layout_block=LayoutBlock.for_text('Ref 1'),
            target_content_id='b0'
        ),
    ])
    reference_list = SemanticReferenceList([
        SemanticReference(
            [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
            content_id='b0'
        ),
    ])
    semantic_document.body_section.add_content(
        SemanticSection([paragraph, reference_list]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    paragraph_xpath = '//tei:body/tei:div/tei:p'
    ref_xpath = f'{paragraph_xpath}/tei:ref[@type="bibr"]'
    assert tei_document.get_xpath_text_content_list(paragraph_xpath) == ['See Ref 1']
    assert tei_document.get_xpath_text_content_list(ref_xpath) == ['Ref 1']
    assert tei_document.get_xpath_text_content_list(f'{ref_xpath}/@target') == ['#b0']
def test_should_add_parsed_references(self):
    """Check that a parsed reference in the back section is rendered to TEI.

    The reference list holds a heading and one reference (title plus raw
    reference text, content id 'b0'). The heading, main title, raw reference
    note and `xml:id` are looked up below `tei:listBibl`.
    """
    semantic_document = SemanticDocument()
    semantic_ref = SemanticReference([
        SemanticTitle(layout_block=LayoutBlock.for_text('Reference Title 1')),
        SemanticRawReferenceText(layout_block=LayoutBlock.for_text('Reference 1')),
    ])
    semantic_ref.content_id = 'b0'
    heading = SemanticHeading(layout_block=LayoutBlock.for_text('References'))
    semantic_document.back_section.add_content(
        SemanticReferenceList([heading, semantic_ref]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    # Shared XPath prefixes for the assertions below.
    list_bibl_xpath = '//tei:back/tei:div[@type="references"]/tei:listBibl'
    bibl_struct_xpath = f'{list_bibl_xpath}/tei:biblStruct'

    assert tei_document.get_xpath_text_content_list(
        f'{list_bibl_xpath}/tei:head'
    ) == ['References']
    assert tei_document.get_xpath_text_content_list(
        f'{bibl_struct_xpath}/tei:analytic/tei:title[@type="main"]'
    ) == ['Reference Title 1']
    assert tei_document.get_xpath_text_content_list(
        f'{bibl_struct_xpath}/tei:note[@type="raw_reference"]'
    ) == ['Reference 1']
    assert tei_document.get_xpath_text_content_list(
        f'{bibl_struct_xpath}/@xml:id'
    ) == ['b0']
def test_should_add_section_figures_to_back(self):
    """Check that a figure inside a back section is rendered under the annex.

    The figure (label, caption, content id 'fig_0') is wrapped in a section
    added to the back section. The figure is expected directly below the
    annex `tei:div`, and no nested `tei:div` is expected there.
    """
    semantic_document = SemanticDocument()
    figure = SemanticFigure([
        SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
        SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
    ], content_id='fig_0')
    semantic_document.back_section.add_content(SemanticSection([figure]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    annex_xpath = '//tei:back/tei:div[@type="annex"]'
    figure_xpath = f'{annex_xpath}/tei:figure[not(contains(@type, "table"))]'

    expected_values_by_xpath = [
        (f'{figure_xpath}/tei:head', ['Label 1']),
        (f'{figure_xpath}/tei:label', ['Label 1']),
        (f'{figure_xpath}/tei:figDesc', ['Caption 1']),
        (f'{figure_xpath}/@xml:id', ['fig_0']),
    ]
    for xpath, expected_values in expected_values_by_xpath:
        assert tei_document.get_xpath_text_content_list(xpath) == expected_values
    assert not tei_document.xpath(f'{annex_xpath}/tei:div')
def test_should_provide_page_and_block_status_for_single_token_blocks(
        self, features_provider: SegmentationLineFeaturesProvider):
    """Check page and block status for three single-token blocks on one page.

    The three lines are expected to report PAGESTART, PAGEIN and PAGEEND as
    page status, and BLOCKSTART as block status for every line.
    """
    single_token_blocks = [
        LayoutBlock.for_text('line1'),
        LayoutBlock.for_text('line2'),
        LayoutBlock.for_text('line3'),
    ]
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=single_token_blocks)])

    feature_values = [
        {
            'page_status': features.get_page_status(),
            'block_status': features.get_block_status()
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)

    expected_feature_values = [
        {'page_status': 'PAGESTART', 'block_status': 'BLOCKSTART'},
        {'page_status': 'PAGEIN', 'block_status': 'BLOCKSTART'},
        {'page_status': 'PAGEEND', 'block_status': 'BLOCKSTART'},
    ]
    assert feature_values == expected_feature_values
def test_should_extract_references_fields_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Run the full text processor over one body block and one reference block.

    With `extract_citation_fields=True`, the single reference is expected to
    expose its title, label and raw text with content id 'b0', and the single
    reference citation found in the document is expected to target 'b0'.
    """
    # Body: free text followed by a citation marker.
    other_body = LayoutBlock.for_text('the body')
    citation_block = LayoutBlock.for_text('1')
    body_block = LayoutBlock.merge_blocks([other_body, citation_block])
    # References: a label followed by the reference text (title only).
    label_block = LayoutBlock.for_text('1')
    ref_title_block = LayoutBlock.for_text('Reference Title 1')
    ref_text_block = LayoutBlock.merge_blocks([ref_title_block])
    ref_block = LayoutBlock.merge_blocks([label_block, ref_text_block])

    fulltext_processor = FullTextProcessor(
        fulltext_models_mock,
        FullTextProcessorConfig(extract_citation_fields=True)
    )

    # (model mock, layout block, label) applied in this exact order.
    label_updates = [
        (fulltext_models_mock.segmentation_model_mock, body_block, '<body>'),
        (fulltext_models_mock.segmentation_model_mock, ref_block, '<references>'),
        (fulltext_models_mock.fulltext_model_mock, other_body, '<section>'),
        (fulltext_models_mock.fulltext_model_mock, citation_block, '<citation_marker>'),
        (fulltext_models_mock.reference_segmenter_model_mock, label_block, '<label>'),
        (fulltext_models_mock.reference_segmenter_model_mock, ref_text_block, '<reference>'),
        (fulltext_models_mock.citation_model_mock, ref_title_block, '<title>'),
    ]
    for model_mock, layout_block, label in label_updates:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[body_block, ref_block])]
    )
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    reference_list = list(
        semantic_document.back_section.iter_by_type(SemanticReferenceList)
    )
    assert len(reference_list) == 1
    references = list(reference_list[0].iter_by_type(SemanticReference))
    assert len(references) == 1
    ref = references[0]
    assert ref.get_text_by_type(SemanticTitle) == ref_title_block.text
    assert ref.get_text_by_type(SemanticLabel) == label_block.text
    assert ref.get_text_by_type(SemanticRawReferenceText) == ref_text_block.text
    assert ref.content_id == 'b0'

    ref_citations = list(
        semantic_document.iter_by_type_recursively(SemanticReferenceCitation)
    )
    assert len(ref_citations) == 1
    assert ref_citations[0].target_content_id == 'b0'
def test_should_set_title_and_abstract(self):
    """Check that title and abstract blocks end up in the semantic front."""
    extractor = HeaderSemanticExtractor()
    entity_blocks = [
        ('<title>', LayoutBlock.for_text(TITLE_1)),
        ('<abstract>', LayoutBlock.for_text(ABSTRACT_1)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    front = SemanticFront(semantic_content_list)
    LOGGER.debug('front: %s', front)

    assert front.get_text_by_type(SemanticTitle) == TITLE_1
    assert front.get_text_by_type(SemanticAbstract) == ABSTRACT_1
def test_should_not_strip_dot_from_label(self):
    """Check that the heading label '1.' keeps its dot in the `n` attribute.

    The rendered head is also expected to contain only the title text and no
    child elements.
    """
    label = SemanticLabel(layout_block=LayoutBlock.for_text('1.'))
    title = SemanticTitle(layout_block=LayoutBlock.for_text('Section Title 1'))
    semantic_heading = SemanticHeading([label, title])

    tei_head = get_tei_child_element_for_semantic_content(semantic_heading)
    LOGGER.debug('tei_head: %r', etree.tostring(tei_head))

    assert tei_head.attrib.get('n') == '1.'
    assert get_text_content(tei_head) == 'Section Title 1'
    assert not list(tei_head)
def test_should_extract_single_figure(self):
    """Check that a label and a figDesc block yield one `SemanticFigure`."""
    extractor = FigureSemanticExtractor()
    entity_blocks = [
        ('<label>', LayoutBlock.for_text('Figure 1')),
        ('<figDesc>', LayoutBlock.for_text('Caption 1')),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 1
    figure = semantic_content_list[0]
    assert isinstance(figure, SemanticFigure)
    assert figure.view_by_type(SemanticLabel).get_text() == 'Figure 1'
    assert figure.view_by_type(SemanticCaption).get_text() == 'Caption 1'
def test_should_be_able_to_extract_single_editor(self):
    """Check that `name_type=SemanticEditor` yields a `SemanticEditor`.

    A forename and a surname block are passed in; the resulting editor is
    expected to expose them as given name and surname text.
    """
    extractor = NameSemanticExtractor()
    entity_blocks = [
        ('<forename>', LayoutBlock.for_text('John')),
        ('<surname>', LayoutBlock.for_text('Smith')),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(
            entity_blocks,
            name_type=SemanticEditor
        )
    )

    assert len(semantic_content_list) == 1
    editor = semantic_content_list[0]
    assert isinstance(editor, SemanticEditor)
    assert editor.given_name_text == 'John'
    assert editor.surname_text == 'Smith'
def test_should_render_graphic_element(self):
    """Check that a figure containing a graphic yields a `tei:graphic` match.

    Only the truthiness of the XPath result list is asserted, not its
    contents.
    """
    figure_content = [
        SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
        SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
        SemanticGraphic(
            layout_graphic=LayoutGraphic(local_file_path='image1.png')
        ),
    ]
    semantic_figure = SemanticFigure(figure_content, content_id='fig_0')

    result = _get_wrapped_figure_tei_element(semantic_figure)

    graphic_text_list = result.get_xpath_text_content_list(
        f'{FIGURE_XPATH}/tei:graphic'
    )
    assert graphic_text_list
def test_should_remove_trailing_dot_from_country(self):
    """Check that the country text 'Country1.' is extracted as 'Country1'."""
    extractor = AffiliationAddressSemanticExtractor()
    entity_blocks = [
        ('<marker>', LayoutBlock.for_text('1')),
        ('<country>', LayoutBlock.for_text('Country1.')),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 1
    aff1 = semantic_content_list[0]
    assert isinstance(aff1, SemanticAffiliationAddress)
    marker_text = aff1.view_by_type(SemanticMarker).get_text()
    assert marker_text == '1'
    country_text = aff1.view_by_type(SemanticCountry).get_text()
    assert country_text == 'Country1'
def test_should_extract_single_raw_reference(self):
    """Check that a label and a reference block yield one raw reference."""
    extractor = ReferenceSegmenterSemanticExtractor()
    entity_blocks = [
        ('<label>', LayoutBlock.for_text('1')),
        ('<reference>', LayoutBlock.for_text('Reference 1')),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 1
    ref = semantic_content_list[0]
    assert isinstance(ref, SemanticRawReference)
    label_text = ref.view_by_type(SemanticLabel).get_text()
    assert label_text == '1'
    raw_reference_text = ref.view_by_type(SemanticRawReferenceText).get_text()
    assert raw_reference_text == 'Reference 1'
def test_should_add_paragraphs_without_title(self):
    """Check that two paragraph blocks without a heading share one section."""
    extractor = FullTextSemanticExtractor()
    entity_blocks = [
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 1
    section = semantic_content_list[0]
    assert isinstance(section, SemanticSection)
    expected_paragraphs = [SECTION_PARAGRAPH_1, SECTION_PARAGRAPH_2]
    assert section.get_paragraph_text_list() == expected_paragraphs
def test_should_raw_table_for_table_text_to_section(self):
    """Check that a `<table>` block becomes a raw table inside the section.

    The paragraph block stays a paragraph, while the table block's text is
    expected to be retrievable via `SemanticRawTable`.
    """
    extractor = FullTextSemanticExtractor()
    entity_blocks = [
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
        ('<table>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 1
    section = semantic_content_list[0]
    assert isinstance(section, SemanticSection)
    assert section.get_paragraph_text_list() == [SECTION_PARAGRAPH_1]
    raw_table_text = section.get_text_by_type(SemanticRawTable)
    assert raw_table_text == SECTION_PARAGRAPH_2
def test_should_add_all_fields(self):
    """Check that every affiliation address field is rendered to TEI.

    The raw affiliation note is expected to carry the full text and the
    marker as label; institution, department and laboratory are expected as
    `tei:orgName`, and the address parts below `tei:address`.
    """
    fields = [
        SemanticMarker(layout_block=LayoutBlock.for_text('1')),
        SemanticInstitution(layout_block=LayoutBlock.for_text('Institution1')),
        SemanticDepartment(layout_block=LayoutBlock.for_text('Department1')),
        SemanticLaboratory(layout_block=LayoutBlock.for_text('Lab1')),
        SemanticAddressLine(layout_block=LayoutBlock.for_text('AddressLine1')),
        SemanticPostCode(layout_block=LayoutBlock.for_text('PostCode1')),
        SemanticPostBox(layout_block=LayoutBlock.for_text('PostBox1')),
        SemanticRegion(layout_block=LayoutBlock.for_text('Region1')),
        SemanticSettlement(layout_block=LayoutBlock.for_text('Settlement1')),
        SemanticCountry(layout_block=LayoutBlock.for_text('Country1')),
    ]
    semantic_affiliation_address = SemanticAffiliationAddress(fields)

    tei_aff_element = get_tei_affiliation_for_semantic_affiliation_address_element(
        semantic_affiliation_address,
        context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT
    )
    tei_aff = TeiElementWrapper(tei_aff_element)
    LOGGER.debug('tei_aff: %r', etree.tostring(tei_aff.element))

    # The raw affiliation note is compared against the semantic object's text.
    assert tei_aff.get_xpath_text_content_list(
        'tei:note[@type="raw_affiliation"]'
    ) == [semantic_affiliation_address.get_text()]

    # Checked in order: relative XPath and the single text expected there.
    expected_text_by_xpath = [
        ('tei:note[@type="raw_affiliation"]/tei:label', '1'),
        ('tei:orgName[@type="institution"]', 'Institution1'),
        ('tei:orgName[@type="department"]', 'Department1'),
        ('tei:orgName[@type="laboratory"]', 'Lab1'),
        ('tei:address/tei:addrLine', 'AddressLine1'),
        ('tei:address/tei:postCode', 'PostCode1'),
        ('tei:address/tei:postBox', 'PostBox1'),
        ('tei:address/tei:region', 'Region1'),
        ('tei:address/tei:settlement', 'Settlement1'),
        ('tei:address/tei:country', 'Country1'),
    ]
    for xpath, expected_text in expected_text_by_xpath:
        assert tei_aff.get_xpath_text_content_list(xpath) == [expected_text]
def test_should_create_back_section(self):
    """Check that a back section with heading and paragraph reaches the annex.

    The heading and paragraph text are expected below a nested `tei:div`
    inside the annex `tei:div` of `tei:back`.
    """
    semantic_document = SemanticDocument()
    section = semantic_document.back_section.add_new_section()
    section.add_heading_block(LayoutBlock.for_text(TOKEN_1))
    paragraph = section.add_new_paragraph()
    paragraph.add_block_content(LayoutBlock.for_text(TOKEN_2))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    head_texts = tei_document.get_xpath_text_content_list(
        '//tei:back/tei:div[@type="annex"]/tei:div/tei:head'
    )
    assert head_texts == [TOKEN_1]
    paragraph_texts = tei_document.get_xpath_text_content_list(
        '//tei:back/tei:div[@type="annex"]/tei:div/tei:p'
    )
    assert paragraph_texts == [TOKEN_2]
def test_should_ignore_additional_title_and_abstract(self):
    """Check that only the first title and first abstract are reported.

    A second title and a second abstract (both 'other') follow the first
    ones; the front is still expected to return the first texts.
    """
    # Note: this behaviour should be reviewed
    extractor = HeaderSemanticExtractor()
    entity_blocks = [
        ('<title>', LayoutBlock.for_text(TITLE_1)),
        ('<abstract>', LayoutBlock.for_text(ABSTRACT_1)),
        ('<title>', LayoutBlock.for_text('other')),
        ('<abstract>', LayoutBlock.for_text('other')),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    front = SemanticFront(semantic_content_list)
    LOGGER.debug('front: %s', front)

    assert front.get_text_by_type(SemanticTitle) == TITLE_1
    assert front.get_text_by_type(SemanticAbstract) == ABSTRACT_1
def test_should_add_note_for_other_text_to_body(self):
    """Check that an 'O' labelled block becomes a 'fulltext:other' note.

    The extracted content is wrapped in a parent section; the note text and
    the single child section's paragraph are then verified.
    """
    extractor = FullTextSemanticExtractor()
    entity_blocks = [
        ('O', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    parent_section = SemanticSection(semantic_content_list)
    other_notes = parent_section.get_notes_text_list('fulltext:other')
    assert other_notes == [SECTION_PARAGRAPH_1]

    sections = parent_section.sections
    assert len(sections) == 1
    assert sections[0].get_paragraph_text_list() == [SECTION_PARAGRAPH_2]
def test_should_extract_table_label_caption_from_body(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels, segmentation_label: str):
    """Run the full text processor over a table citation followed by a table.

    With `extract_table_fields=True`, one `SemanticTable` with label, caption
    and content id 'tab_0' is expected in the body or back section, along
    with one table citation targeting 'tab_0'.

    `segmentation_label` is the label assigned to the whole block by the
    segmentation model mock; where its values come from is not visible in
    this method (presumably test parametrization — confirm at the decorator).
    """
    citation_block = LayoutBlock.for_text('Table 1')
    label_block = LayoutBlock.for_text('Table 1')
    caption_block = LayoutBlock.for_text('Caption 1')
    other_block = LayoutBlock.for_text('Other')
    figure_block = LayoutBlock.merge_blocks(
        [label_block, other_block, caption_block]
    )
    fulltext_block = LayoutBlock.merge_blocks([citation_block, figure_block])

    fulltext_processor = FullTextProcessor(
        fulltext_models_mock,
        FullTextProcessorConfig(extract_table_fields=True)
    )

    # (model mock, layout block, label) applied in this exact order.
    label_updates = [
        (fulltext_models_mock.segmentation_model_mock, fulltext_block, segmentation_label),
        (fulltext_models_mock.fulltext_model_mock, citation_block, '<table_marker>'),
        (fulltext_models_mock.fulltext_model_mock, figure_block, '<table>'),
        (fulltext_models_mock.table_model_mock, label_block, '<label>'),
        (fulltext_models_mock.table_model_mock, caption_block, '<figDesc>'),
    ]
    for model_mock, layout_block, label in label_updates:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[fulltext_block])]
    )
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    # The table may be in either the body or the back section.
    searched_sections = [
        semantic_document.body_section,
        semantic_document.back_section
    ]
    table_list = list(
        iter_by_semantic_type_recursively(searched_sections, SemanticTable)
    )
    assert len(table_list) == 1
    table = table_list[0]
    assert table.get_text_by_type(SemanticLabel) == label_block.text
    assert table.get_text_by_type(SemanticCaption) == caption_block.text
    assert table.content_id == 'tab_0'

    table_citation_list = list(
        semantic_document.iter_by_type_recursively(SemanticTableCitation)
    )
    assert len(table_citation_list) == 1
    table_citation = table_citation_list[0]
    assert table_citation.get_text() == citation_block.text
    assert table_citation.target_content_id == 'tab_0'
def test_should_split_raw_affiliation_on_new_aff_without_address(self):
    """Check that two consecutive affiliation blocks yield two raw entries.

    No address block is present; each `SemanticRawAffiliationAddress` is
    expected to hold one of the two affiliation texts, in input order.
    """
    extractor = HeaderSemanticExtractor()
    entity_blocks = [
        ('<affiliation>', LayoutBlock.for_text(AFFILIATION_1)),
        ('<affiliation>', LayoutBlock.for_text(AFFILIATION_2)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    front = SemanticFront(semantic_content_list)
    LOGGER.debug('front: %s', front)

    aff_address_list = list(front.iter_by_type(SemanticRawAffiliationAddress))
    raw_affiliation_texts = [
        aff_address.get_text_by_type(SemanticRawAffiliation)
        for aff_address in aff_address_list
    ]
    assert raw_affiliation_texts == [AFFILIATION_1, AFFILIATION_2]
def test_should_add_separate_section_label(self):
    """Check that the leading '1' of a section heading becomes its label.

    The heading text is '1 ' followed by the section title; the label and
    the title are expected to be separated inside the single heading.
    """
    extractor = FullTextSemanticExtractor()
    entity_blocks = [
        ('<section>', LayoutBlock.for_text('1 ' + SECTION_TITLE_1)),
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 1
    section = semantic_content_list[0]
    assert isinstance(section, SemanticSection)

    semantic_headings = list(section.iter_by_type(SemanticHeading))
    assert len(semantic_headings) == 1
    heading = semantic_headings[0]
    assert heading.get_text_by_type(SemanticLabel) == '1'
    assert heading.get_text_by_type(SemanticTitle) == SECTION_TITLE_1
def test_should_extract_editor_names_from_references_fields(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Run the full text processor over a reference consisting of an editor.

    With citation fields and citation editors enabled (and citation authors
    disabled), the single reference is expected to contain one
    `SemanticEditor` with the given name and surname texts.
    """
    given_name_block = LayoutBlock.for_text('Given name')
    surname_block = LayoutBlock.for_text('Surname')
    other_block = LayoutBlock.for_text('Other')
    editors_block = LayoutBlock.merge_blocks(
        [given_name_block, other_block, surname_block]
    )
    ref_text_block = LayoutBlock.merge_blocks([editors_block])
    ref_block = LayoutBlock.merge_blocks([ref_text_block])

    processor_config = FullTextProcessorConfig(
        extract_citation_fields=True,
        extract_citation_authors=False,
        extract_citation_editors=True
    )
    fulltext_processor = FullTextProcessor(fulltext_models_mock, processor_config)

    # (model mock, layout block, label) applied in this exact order.
    label_updates = [
        (fulltext_models_mock.segmentation_model_mock, ref_block, '<references>'),
        (fulltext_models_mock.reference_segmenter_model_mock, ref_text_block, '<reference>'),
        (fulltext_models_mock.citation_model_mock, editors_block, '<editor>'),
        (fulltext_models_mock.name_citation_model_mock, given_name_block, '<forename>'),
        (fulltext_models_mock.name_citation_model_mock, surname_block, '<surname>'),
    ]
    for model_mock, layout_block, label in label_updates:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[ref_block])])
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    reference_list = list(
        semantic_document.back_section.iter_by_type(SemanticReferenceList)
    )
    assert len(reference_list) == 1
    references = list(reference_list[0].iter_by_type(SemanticReference))
    assert len(references) == 1
    ref = references[0]
    editors = list(ref.iter_by_type(SemanticEditor))
    assert len(editors) == 1
    editor = editors[0]
    assert editor.given_name_text == given_name_block.text
    assert editor.surname_text == surname_block.text
def test_should_extract_affiliation_address_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Run the full text processor over a header with affiliation and address.

    The header consists of an affiliation (marker and institution) followed
    by an address (country). One `SemanticAffiliationAddress` with those
    three texts and content id 'aff0' is expected in the document front.
    """
    marker_block = LayoutBlock.for_text('1')
    institution_block = LayoutBlock.for_text('Institution1')
    country_block = LayoutBlock.for_text('Country1')
    aff_block = LayoutBlock.merge_blocks([marker_block, institution_block])
    address_block = LayoutBlock.merge_blocks([country_block])
    aff_address_block = LayoutBlock.merge_blocks([aff_block, address_block])

    fulltext_processor = FullTextProcessor(fulltext_models_mock)
    # The whole header is made up of the affiliation/address block.
    header_block = aff_address_block

    # (model mock, layout block, label) applied in this exact order.
    label_updates = [
        (fulltext_models_mock.segmentation_model_mock, header_block, '<header>'),
        (fulltext_models_mock.header_model_mock, aff_block, '<affiliation>'),
        (fulltext_models_mock.header_model_mock, address_block, '<address>'),
        (fulltext_models_mock.affiliation_address_model_mock, marker_block, '<marker>'),
        (
            fulltext_models_mock.affiliation_address_model_mock,
            institution_block,
            '<institution>'
        ),
        (fulltext_models_mock.affiliation_address_model_mock, country_block, '<country>'),
    ]
    for model_mock, layout_block, label in label_updates:
        model_mock.update_label_by_layout_block(layout_block, label)

    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[header_block])])
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document
    )
    assert semantic_document is not None

    front = semantic_document.front
    assert front.get_text() == aff_address_block.text
    assert (
        front.view_by_type(SemanticAffiliationAddress).get_text()
    ) == aff_address_block.text

    affiliations = list(front.iter_by_type(SemanticAffiliationAddress))
    assert len(affiliations) == 1
    affiliation = affiliations[0]
    assert affiliation.get_text_by_type(SemanticMarker) == marker_block.text
    assert affiliation.get_text_by_type(SemanticInstitution) == institution_block.text
    assert affiliation.get_text_by_type(SemanticCountry) == country_block.text
    assert affiliation.content_id == 'aff0'
def test_should_add_raw_affiliation_address(self):
    """Check that an affiliation and an address block form one raw entry.

    The single `SemanticRawAffiliationAddress` is expected to hold both the
    raw affiliation text and the raw address text.
    """
    extractor = HeaderSemanticExtractor()
    entity_blocks = [
        ('<affiliation>', LayoutBlock.for_text(AFFILIATION_1)),
        ('<address>', LayoutBlock.for_text(ADDRESS_1)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    front = SemanticFront(semantic_content_list)
    LOGGER.debug('front: %s', front)

    aff_address_list = list(front.iter_by_type(SemanticRawAffiliationAddress))
    assert len(aff_address_list) == 1
    aff_address = aff_address_list[0]
    assert aff_address.get_text_by_type(SemanticRawAffiliation) == AFFILIATION_1
    assert aff_address.get_text_by_type(SemanticRawAddress) == ADDRESS_1
def test_should_include_reference_citation_in_paragraph(self):
    """Check that a citation marker between paragraph blocks is extracted.

    One section is expected, containing exactly one
    `SemanticReferenceCitation` with the text 'Ref 1'.
    """
    extractor = FullTextSemanticExtractor()
    entity_blocks = [
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
        ('<citation_marker>', LayoutBlock.for_text('Ref 1')),
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
    LOGGER.debug('semantic_content_list: %s', semantic_content_list)

    assert len(semantic_content_list) == 1
    section = semantic_content_list[0]
    assert isinstance(section, SemanticSection)

    reference_citations = list(
        section.iter_by_type_recursively(SemanticReferenceCitation)
    )
    assert len(reference_citations) == 1
    assert reference_citations[0].get_text() == 'Ref 1'
def test_should_extract_preceeding_other_text(self):
    """Check that 'O' text before the marker is emitted as a separate note.

    Two items are expected: a `SemanticNote` followed by the
    `SemanticAffiliationAddress` holding the marker and institution.
    """
    extractor = AffiliationAddressSemanticExtractor()
    entity_blocks = [
        ('O', LayoutBlock.for_text('Other 1')),
        ('<marker>', LayoutBlock.for_text('1')),
        ('<institution>', LayoutBlock.for_text('Institution 1')),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks))

    assert len(semantic_content_list) == 2
    note = semantic_content_list[0]
    assert isinstance(note, SemanticNote)
    aff_address = semantic_content_list[1]
    assert isinstance(aff_address, SemanticAffiliationAddress)
    assert aff_address.view_by_type(SemanticMarker).get_text() == '1'
    institution_text = aff_address.view_by_type(SemanticInstitution).get_text()
    assert institution_text == 'Institution 1'
def test_should_reject_reference_without_any_detected_fields(self):
    """Check that a reference labelled only 'O' becomes an invalid reference.

    The raw reference text is passed as a single 'O' block; one
    `SemanticInvalidReference` carrying the same text is expected.
    """
    semantic_raw_ref_text = SemanticRawReferenceText(
        layout_block=LayoutBlock.for_text('Reference 1')
    )
    semantic_raw_ref = SemanticRawReference(
        [semantic_raw_ref_text],
        content_id='raw1'
    )

    extractor = CitationSemanticExtractor()
    entity_blocks = [
        ('O', LayoutBlock.for_text(semantic_raw_ref_text.get_text())),
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(
            entity_blocks,
            semantic_raw_reference=semantic_raw_ref
        )
    )

    assert len(semantic_content_list) == 1
    ref = semantic_content_list[0]
    assert isinstance(ref, SemanticInvalidReference)
    assert ref.get_text() == semantic_raw_ref_text.get_text()
def test_should_render_label_description_and_id(self):
    """Check that a figure's label, caption and id are rendered to TEI.

    The label is expected in both `tei:head` and `tei:label`, the caption in
    `tei:figDesc` and the content id in `@xml:id`.
    """
    label = SemanticLabel(layout_block=LayoutBlock.for_text('Label 1'))
    caption = SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
    semantic_figure = SemanticFigure([label, caption], content_id='fig_0')

    result = _get_wrapped_figure_tei_element(semantic_figure)

    # Checked in order: XPath and the values expected there.
    expected_values_by_xpath = [
        (f'{FIGURE_XPATH}/tei:head', ['Label 1']),
        (f'{FIGURE_XPATH}/tei:label', ['Label 1']),
        (f'{FIGURE_XPATH}/tei:figDesc', ['Caption 1']),
        (f'{FIGURE_XPATH}/@xml:id', ['fig_0']),
    ]
    for xpath, expected_values in expected_values_by_xpath:
        assert result.get_xpath_text_content_list(xpath) == expected_values