def get_tei_element_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``figure`` element (``type="table"``) for a semantic table.

    Child handling:
      * ``SemanticLabel``: rendered twice from the same layout block,
        first as ``head`` and then as ``label``.
      * ``SemanticCaption``: rendered as ``figDesc``.
      * anything else: delegated to
        ``context.get_tei_child_elements_for_semantic_content``.

    Args:
        semantic_content: must be a ``SemanticTable`` (asserted).
        context: factory context used to render attributes and children.

    Returns:
        The ``figure`` element.
    """
    LOGGER.debug('semantic_content: %s', semantic_content)
    assert isinstance(semantic_content, SemanticTable)
    table = semantic_content
    figure_children = [
        context.get_default_attributes_for_semantic_content(table)
    ]
    for table_child in table:
        if isinstance(table_child, SemanticLabel):
            label_block = table_child.merged_block
            # the block's children are generated afresh for each of the two elements
            for tag_name in ('head', 'label'):
                figure_children.append(TEI_E(
                    tag_name,
                    *context.iter_layout_block_tei_children(label_block)
                ))
        elif isinstance(table_child, SemanticCaption):
            figure_children.append(TEI_E(
                'figDesc',
                *context.iter_layout_block_tei_children(table_child.merged_block)
            ))
        else:
            figure_children.extend(
                context.get_tei_child_elements_for_semantic_content(table_child)
            )
    return TEI_E('figure', {'type': 'table'}, *figure_children)
def get_tei_author_for_semantic_author_element(
    semantic_author: SemanticAuthor,
    context: TeiElementFactoryContext,
    affiliations_by_marker: Optional[Mapping[str, Sequence[SemanticAffiliationAddress]]] = None
) -> etree.ElementBase:
    """Build the TEI ``author`` element for a semantic author.

    The element contains a ``persName`` (carrying the author's default
    attributes and rendered children) followed by one ``affiliation`` element
    for every affiliation looked up via the author's marker texts.

    Args:
        semantic_author: the author to render.
        context: factory context used to render attributes and children.
        affiliations_by_marker: optional mapping of marker text to the
            affiliations for that marker; ``None`` is treated as empty.

    Returns:
        The ``author`` element.
    """
    marker_to_affiliations = (
        {} if affiliations_by_marker is None else affiliations_by_marker
    )
    LOGGER.debug('semantic_author: %s', semantic_author)
    name_parts = [
        tei_child
        for author_part in semantic_author
        for tei_child in context.get_tei_child_elements_for_semantic_content(
            author_part
        )
    ]
    author_children = [
        TEI_E(
            'persName',
            context.get_default_attributes_for_semantic_content(semantic_author),
            *name_parts
        )
    ]
    marker_texts = semantic_author.view_by_type(SemanticMarker).get_text_list()
    for marker in marker_texts:
        matched_affiliations = marker_to_affiliations.get(marker)
        if not matched_affiliations:
            # missing (or empty) entry: warn and carry on with the next marker
            LOGGER.warning('affiliation not found for marker: %r', marker)
            continue
        author_children.extend(
            get_tei_affiliation_for_semantic_affiliation_address_element(
                matched_affiliation,
                context=context
            )
            for matched_affiliation in matched_affiliations
        )
    return TEI_E('author', *author_children)
def get_tei_reference_element(  # pylint: disable=too-many-branches
    semantic_ref: SemanticReference,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``biblStruct`` element for a semantic reference.

    For every child the parent path is requested from the context and the
    matching parent element is fetched or created. Labels and raw reference
    text become notes, titles become ``title`` (level ``a``, type ``main``),
    authors become ``author`` elements, dates are placed under
    ``monogr/imprint``, and everything else is delegated to the context.

    Args:
        semantic_ref: the reference to render.
        context: factory context used to render attributes and children.

    Returns:
        The ``biblStruct`` element held by the builder.
    """
    LOGGER.debug('semantic_ref: %s', semantic_ref)
    builder = TeiElementBuilder(TEI_E(
        'biblStruct',
        context.get_default_attributes_for_semantic_content(semantic_ref)
    ))
    date_seen = False
    for ref_child in semantic_ref:
        # resolved (and created if missing) up-front for every child, dates included
        child_parent_path = context.get_parent_path_for_semantic_content(ref_child)
        target_parent = builder.get_or_create(child_parent_path)
        if isinstance(ref_child, SemanticLabel):
            target_parent.append(
                create_tei_note_element('label', ref_child.merged_block)
            )
        elif isinstance(ref_child, SemanticRawReferenceText):
            target_parent.append(
                create_tei_note_element('raw_reference', ref_child.merged_block)
            )
        elif isinstance(ref_child, SemanticTitle):
            target_parent.append(TEI_E(
                'title',
                {'level': 'a', 'type': 'main'},
                *context.iter_layout_block_tei_children(ref_child.merged_block)
            ))
        elif isinstance(ref_child, SemanticAuthor):
            target_parent.append(
                get_tei_author_for_semantic_author_element(ref_child, context=context)
            )
        elif isinstance(ref_child, SemanticDate):
            # dates always go under monogr/imprint, whatever the context's parent path
            imprint_parent = builder.get_or_create(['monogr', 'imprint'])
            date_attrib = {}
            if not date_seen:
                # assume first date is published date (more or less matches GROBID)
                date_attrib['type'] = 'published'
            if ref_child.year:
                date_attrib['when'] = str(ref_child.year)
            imprint_parent.append(TEI_E(
                'date',
                date_attrib,
                *context.iter_layout_block_tei_children(
                    layout_block=ref_child.merged_block
                )
            ))
            date_seen = True
        else:
            target_parent.extend(
                context.get_tei_child_elements_for_semantic_content(ref_child)
            )
    return builder.element
def get_tei_element_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``biblScope`` (``unit="page"``) element for a page range.

    When both ``from_page`` and ``to_page`` are truthy they are emitted as the
    ``from`` / ``to`` attributes and the element has no text. Otherwise the
    element carries the page range's text.

    Args:
        semantic_content: must be a ``SemanticPageRange`` (asserted).
        context: unused by this factory.

    Returns:
        The ``biblScope`` element.
    """
    assert isinstance(semantic_content, SemanticPageRange)
    page_range = semantic_content
    if page_range.from_page and page_range.to_page:
        return TEI_E(
            'biblScope',
            {
                'unit': 'page',
                'from': page_range.from_page,
                'to': page_range.to_page
            }
        )
    # at least one boundary is missing: fall back to the text
    return TEI_E('biblScope', {'unit': 'page'}, page_range.get_text())
def get_dummy_tei_author_for_semantic_affiliations_element(
    semantic_affiliations: Sequence[SemanticAffiliationAddress],
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build a placeholder TEI ``author`` element holding the given affiliations.

    The element starts with a ``note`` of type ``dummy_author`` and is followed
    by one ``affiliation`` element per passed-in affiliation.

    Args:
        semantic_affiliations: affiliations to attach to the dummy author.
        context: factory context forwarded to the affiliation renderer.

    Returns:
        The ``author`` element.
    """
    dummy_note = TEI_E(
        'note',
        {'type': 'dummy_author'},
        'Dummy author for orphan affiliations'
    )
    affiliation_elements = [
        get_tei_affiliation_for_semantic_affiliation_address_element(
            orphan_affiliation,
            context=context
        )
        for orphan_affiliation in semantic_affiliations
    ]
    return TEI_E('author', dummy_note, *affiliation_elements)
def get_tei_children_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> List[etree.ElementBase]:
    """Build a single TEI ``formula`` element for a raw equation.

    ``SemanticRawEquationContent`` children are rendered from their merged
    layout block; other children are appended via the context. The whitespace
    reported after one child is only emitted when another child follows.

    Args:
        semantic_content: must be a ``SemanticRawEquation`` (asserted).
        context: factory context used to render attributes and children.

    Returns:
        A one-element list containing the ``formula`` element.
    """
    LOGGER.debug('semantic_content: %s', semantic_content)
    assert isinstance(semantic_content, SemanticRawEquation)
    equation = semantic_content
    formula_children: T_ElementChildrenList = [
        context.get_default_attributes_for_semantic_content(equation)
    ]
    trailing_whitespace = ''
    for equation_child in equation:
        if not isinstance(equation_child, SemanticRawEquationContent):
            trailing_whitespace = context.append_tei_children_list_and_get_whitespace(
                formula_children,
                equation_child,
                pending_whitespace=trailing_whitespace
            )
            continue
        content_block = equation_child.merged_block
        if trailing_whitespace:
            formula_children.append(trailing_whitespace)
        formula_children.extend(
            context.iter_layout_block_tei_children(content_block)
        )
        trailing_whitespace = content_block.whitespace
    return [TEI_E('formula', *formula_children)]
def _get_wrapped_figure_tei_element(
    semantic_figure: SemanticFigure
) -> TeiElementWrapper:
    """Render a semantic figure under a ``root`` element and wrap the result.

    Rendering uses a fresh ``FigureTeiElementFactory`` together with
    ``DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT``.
    """
    figure_children = FigureTeiElementFactory().get_tei_children_for_semantic_content(
        semantic_figure,
        context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT
    )
    root_element = TEI_E('root', *figure_children)
    return TeiElementWrapper(root_element)
def get_tei_children_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> List[etree.ElementBase]:
    """Build the TEI elements for a semantic paragraph.

    The paragraph is passed through ``iter_flat_paragraph_formula``. Each
    yielded ``SemanticParagraph`` becomes its own ``p`` element; any other
    yielded item is rendered via the context and added to the result as-is.

    Args:
        semantic_content: must be a ``SemanticParagraph`` (asserted).
        context: factory context used to render attributes and children.

    Returns:
        The resulting elements, in iteration order.
    """
    LOGGER.debug('semantic_content: %s', semantic_content)
    assert isinstance(semantic_content, SemanticParagraph)
    paragraph = semantic_content
    tei_elements: List[etree.ElementBase] = []
    for flat_content in iter_flat_paragraph_formula(paragraph):
        if isinstance(flat_content, SemanticParagraph):
            paragraph_children: T_ElementChildrenList = [
                context.get_default_attributes_for_semantic_content(flat_content)
            ]
            trailing_whitespace = ''
            for paragraph_child in flat_content:
                trailing_whitespace = (
                    context.append_tei_children_list_and_get_whitespace(
                        paragraph_children,
                        paragraph_child,
                        pending_whitespace=trailing_whitespace
                    )
                )
            tei_elements.append(TEI_E('p', *paragraph_children))
        else:
            tei_elements.extend(
                context.get_tei_child_elements_for_semantic_content(flat_content)
            )
    return tei_elements
def __init__(self, root: Optional[etree.ElementBase] = None):
    """Initialise with the given root element, or a new ``TEI`` element.

    Args:
        root: existing root element; when ``None`` a ``TEI`` element is created.
    """
    self.root = TEI_E('TEI') if root is None else root
    self._reference_element: Optional[etree.ElementBase] = None
    # the base class is initialised with the resolved root
    super().__init__(self.root)
def set_title_layout_block(self, title_block: LayoutBlock):
    """Set the main title under ``teiHeader/fileDesc/titleStmt`` from a layout block.

    The ``title`` element (level ``a``, type ``main``) is populated with the
    children produced by ``iter_layout_block_tei_children``.
    """
    title_element = TEI_E(
        'title',
        {'level': 'a', 'type': 'main'},
        *iter_layout_block_tei_children(title_block)
    )
    self.set_child_element_at(
        ['teiHeader', 'fileDesc', 'titleStmt'],
        title_element
    )
def test_should_add_superscript_text(self):
    """The token with the superscript font is wrapped in ``hi[@rend="superscript"]``."""
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
        LayoutToken(TOKEN_3)
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    node = TEI_E.node(*iter_layout_block_tei_children(block))
    superscript_texts = get_tei_xpath_text_content_list(
        node, './tei:hi[@rend="superscript"]'
    )
    assert superscript_texts == [TOKEN_2]
    # the overall text is the three tokens joined by single spaces
    expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
    assert get_text_content(node) == expected_text
def _get_wrapped_graphic_tei_element(
    semantic_graphic: SemanticGraphic
) -> TeiElementWrapper:
    """Render a semantic graphic under a ``root`` element and wrap the result.

    Rendering uses a fresh ``GraphicTeiElementFactory`` together with
    ``DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT``.
    """
    graphic_children = GraphicTeiElementFactory().get_tei_children_for_semantic_content(
        semantic_graphic,
        context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT
    )
    root_element = TEI_E('root', *graphic_children)
    return TeiElementWrapper(root_element)
def test_should_add_bold_and_italics_text(self):
    """The token with the bold-italics font appears under both a bold and an italic ``hi``."""
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
        LayoutToken(TOKEN_3)
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    node = TEI_E.node(*iter_layout_block_tei_children(block))
    LOGGER.debug('xml: %r', etree.tostring(node))
    bold_texts = get_tei_xpath_text_content_list(
        node, './/tei:hi[@rend="bold"]'
    )
    assert bold_texts == [TOKEN_2]
    italic_texts = get_tei_xpath_text_content_list(
        node, './/tei:hi[@rend="italic"]'
    )
    assert italic_texts == [TOKEN_2]
    # the overall text is the three tokens joined by single spaces
    expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
    assert get_text_content(node) == expected_text
def get_tei_affiliation_for_semantic_affiliation_address_element(
    semantic_affiliation_address: SemanticAffiliationAddress,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``affiliation`` element for a semantic affiliation address.

    The element contains, in order: the attributes, the raw affiliation note,
    the rendered non-address children, and (only if there are address fields)
    a single ``address`` element grouping the rendered address fields.

    When the affiliation has a content id, it is emitted as the ``key``
    attribute and the ``XML_ID`` attribute is dropped.

    Args:
        semantic_affiliation_address: the affiliation to render.
        context: factory context used to render attributes and children.

    Returns:
        The ``affiliation`` element.
    """
    LOGGER.debug('semantic_affiliation_address: %s', semantic_affiliation_address)
    raw_affiliation_note = (
        _get_tei_raw_affiliation_element_for_semantic_affiliation_address(
            semantic_affiliation_address,
            context=context
        )
    )
    affiliation_attributes = context.get_default_attributes_for_semantic_content(
        semantic_affiliation_address
    )
    content_id = semantic_affiliation_address.content_id
    if content_id:
        # work on a copy so that the mapping returned by the context is left untouched
        affiliation_attributes = {**affiliation_attributes, 'key': content_id}
        affiliation_attributes.pop(XML_ID, None)
    affiliation_children = [affiliation_attributes, raw_affiliation_note]
    address_fields = []
    for affiliation_child in semantic_affiliation_address:
        if isinstance(affiliation_child, SemanticAddressField):
            # collected here, rendered together inside `address` further down
            address_fields.append(affiliation_child)
        else:
            affiliation_children.extend(
                context.get_tei_child_elements_for_semantic_content(
                    affiliation_child
                )
            )
    LOGGER.debug('address_semantic_content_list: %r', address_fields)
    if address_fields:
        address_children = []
        for address_field in address_fields:
            address_children.extend(
                context.get_tei_child_elements_for_semantic_content(address_field)
            )
        affiliation_children.append(TEI_E('address', *address_children))
    return TEI_E('affiliation', *affiliation_children)
def get_tei_element_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``ref`` element for a semantic citation.

    ``type`` is looked up in ``CITATION_TYPE_BY_SEMANTIC_CLASS`` by the exact
    class of the citation and omitted when nothing truthy is found.
    ``target`` is ``'#'`` followed by the target content id, omitted when the
    id is falsy.

    Args:
        semantic_content: must be a ``SemanticCitation`` (asserted).
        context: factory context used to render the citation's layout block.

    Returns:
        The ``ref`` element.
    """
    assert isinstance(semantic_content, SemanticCitation)
    citation = semantic_content
    ref_attributes = {}
    citation_type = CITATION_TYPE_BY_SEMANTIC_CLASS.get(type(citation))
    if citation_type:
        ref_attributes['type'] = citation_type
    target_content_id = citation.target_content_id
    if target_content_id:
        ref_attributes['target'] = '#' + target_content_id
    ref_children = context.iter_layout_block_tei_children(citation.merged_block)
    return TEI_E('ref', ref_attributes, *ref_children)
def _get_tei_raw_affiliation_element_for_semantic_affiliation_address(
    semantic_affiliation_address: SemanticAffiliationAddress,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the ``note`` element (type ``raw_affiliation``) for an affiliation.

    Every child's merged block is rendered with coordinates disabled.
    ``SemanticMarker`` children are wrapped in a ``label`` element; all other
    children are added inline. The whitespace of one block is only emitted
    when another child follows.

    Args:
        semantic_affiliation_address: the affiliation to render.
        context: factory context used to render the layout blocks.

    Returns:
        The ``note`` element.
    """
    note_children: List[Union[str, dict, etree.ElementBase]] = [
        {'type': 'raw_affiliation'}
    ]
    trailing_whitespace: str = ''
    for affiliation_child in semantic_affiliation_address:
        child_block = affiliation_child.merged_block
        if trailing_whitespace:
            note_children.append(trailing_whitespace)
        block_children = context.iter_layout_block_tei_children(
            child_block,
            enable_coordinates=False
        )
        if isinstance(affiliation_child, SemanticMarker):
            note_children.append(TEI_E('label', *block_children))
        else:
            note_children.extend(block_children)
        trailing_whitespace = child_block.whitespace
    return TEI_E('note', *note_children)
def get_tei_element_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``note`` element for a semantic mixed note.

    The ``type`` attribute is the note's ``note_type``, or ``'other'`` when
    that is falsy. All children are rendered via the context.

    Args:
        semantic_content: must be a ``SemanticMixedNote`` (asserted).
        context: factory context used to render attributes and children.

    Returns:
        The ``note`` element.
    """
    LOGGER.debug('semantic_content: %s', semantic_content)
    assert isinstance(semantic_content, SemanticMixedNote)
    mixed_note = semantic_content
    resolved_note_type = mixed_note.note_type or 'other'
    note_children = [
        context.get_default_attributes_for_semantic_content(mixed_note),
        {'type': resolved_note_type}
    ]
    for note_child in mixed_note:
        note_children += context.get_tei_child_elements_for_semantic_content(
            note_child
        )
    return TEI_E('note', *note_children)
def get_tei_raw_reference_list_element(
    semantic_reference_list: SemanticReferenceList,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``listBibl`` element for a semantic reference list.

    ``SemanticRawReference`` and ``SemanticReference`` children are rendered
    by their dedicated functions; every other child is delegated to the
    context.

    Args:
        semantic_reference_list: the reference list to render.
        context: factory context used to render the children.

    Returns:
        The ``listBibl`` element held by the builder.
    """
    list_builder = TeiElementBuilder(TEI_E('listBibl'))
    for list_item in semantic_reference_list:
        if isinstance(list_item, SemanticRawReference):
            list_builder.append(
                _get_tei_raw_reference_element(list_item, context=context)
            )
        elif isinstance(list_item, SemanticReference):
            list_builder.append(
                get_tei_reference_element(list_item, context=context)
            )
        else:
            list_builder.extend(
                context.get_tei_child_elements_for_semantic_content(list_item)
            )
    return list_builder.element
def get_tei_element_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``idno`` element for a semantic external identifier.

    The element text is the identifier's ``value``. A ``type`` attribute is
    added on top of the default attributes when the identifier type is truthy.

    Args:
        semantic_content: must be a ``SemanticExternalIdentifier`` (asserted).
        context: factory context providing the default attributes.

    Returns:
        The ``idno`` element.
    """
    assert isinstance(semantic_content, SemanticExternalIdentifier)
    external_identifier = semantic_content
    # guard so that get_text() is only evaluated when debug logging is enabled
    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug(
            'external_identifier: type=%r, value=%r, text=%r, content=%r',
            external_identifier.external_identifier_type,
            external_identifier.value,
            external_identifier.get_text(),
            external_identifier
        )
    idno_attributes = context.get_default_attributes_for_semantic_content(
        external_identifier
    )
    identifier_type = external_identifier.external_identifier_type
    if identifier_type:
        idno_attributes = {**idno_attributes, 'type': identifier_type}
    return TEI_E('idno', idno_attributes, external_identifier.value)
def _get_tei_raw_reference_element(
    semantic_raw_ref: SemanticRawReference,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``biblStruct`` element for a raw (unparsed) reference.

    ``SemanticRawReferenceText`` children become ``raw_reference`` notes;
    every other child is delegated to the context.

    Args:
        semantic_raw_ref: the raw reference to render.
        context: factory context used to render attributes and children.

    Returns:
        The ``biblStruct`` element.
    """
    LOGGER.debug('semantic_raw_ref: %s', semantic_raw_ref)
    bibl_children = []
    for raw_ref_child in semantic_raw_ref:
        if isinstance(raw_ref_child, SemanticRawReferenceText):
            bibl_children.append(
                create_tei_note_element('raw_reference', raw_ref_child.merged_block)
            )
        else:
            bibl_children.extend(
                context.get_tei_child_elements_for_semantic_content(raw_ref_child)
            )
    return TEI_E(
        'biblStruct',
        context.get_default_attributes_for_semantic_content(semantic_raw_ref),
        *bibl_children
    )
def get_tei_element_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``graphic`` element for a semantic graphic.

    The element only carries attributes: the default attributes (requested
    with coordinates disabled), ``url`` from the relative path, and, taken
    from the layout graphic when present, ``coords`` and ``type``. Each
    optional attribute is omitted when its source value is falsy.

    Args:
        semantic_content: must be a ``SemanticGraphic`` (asserted).
        context: factory context providing the default attributes.

    Returns:
        The ``graphic`` element.
    """
    LOGGER.debug('semantic_content: %s', semantic_content)
    assert isinstance(semantic_content, SemanticGraphic)
    graphic = semantic_content
    layout_graphic = graphic.layout_graphic
    attribute_dicts = [
        context.get_default_attributes_for_semantic_content(
            graphic,
            enable_coordinates=False
        )
    ]
    if graphic.relative_path:
        attribute_dicts.append({'url': graphic.relative_path})
    if layout_graphic and layout_graphic.coordinates:
        attribute_dicts.append(
            {'coords': format_coordinates(layout_graphic.coordinates)}
        )
    if layout_graphic and layout_graphic.graphic_type:
        attribute_dicts.append({'type': layout_graphic.graphic_type})
    return TEI_E('graphic', *attribute_dicts)
def get_tei_children_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> List[etree.ElementBase]:
    """Build the TEI ``div`` element for a semantic section.

    Figures and tables are skipped here; all other children are rendered via
    the context. Acknowledgement sections get ``type="acknowledgement"``.

    Args:
        semantic_content: must be a ``SemanticSection`` (asserted).
        context: factory context used to render the children.

    Returns:
        A one-element list with the ``div``, or an empty list when the
        ``div`` ended up without child elements.
    """
    LOGGER.debug('semantic_content: %s', semantic_content)
    assert isinstance(semantic_content, SemanticSection)
    section = semantic_content
    div_builder = TeiElementBuilder(TEI_E('div'))
    skipped_types = (SemanticFigure, SemanticTable)
    for section_child in section:
        if isinstance(section_child, skipped_types):
            # rendered at parent level
            continue
        div_builder.extend(
            context.get_tei_child_elements_for_semantic_content(section_child)
        )
    div_element = div_builder.element
    if section.section_type == SemanticSectionTypes.ACKNOWLEDGEMENT:
        div_element.attrib['type'] = 'acknowledgement'
    if list(div_element):
        return [div_element]
    return []
def get_tei_element_for_semantic_content(
    self,
    semantic_content: SemanticContentWrapper,
    context: TeiElementFactoryContext
) -> etree.ElementBase:
    """Build the TEI ``head`` element for a semantic heading.

    A ``SemanticLabel`` child contributes its text as the ``n`` attribute
    rather than as content. Every other child is rendered from its merged
    layout block with coordinates disabled; the whitespace of one block is
    only emitted when another non-label child follows.

    Args:
        semantic_content: must be a ``SemanticHeading`` (asserted).
        context: factory context used to render attributes and children.

    Returns:
        The ``head`` element.
    """
    LOGGER.debug('semantic_content: %s', semantic_content)
    assert isinstance(semantic_content, SemanticHeading)
    heading = semantic_content
    head_children: T_ElementChildrenList = [
        context.get_default_attributes_for_semantic_content(heading)
    ]
    trailing_whitespace = ''
    for heading_child in heading:
        if isinstance(heading_child, SemanticLabel):
            # labels do not touch the pending whitespace
            head_children.append({'n': heading_child.get_text()})
            continue
        child_block = heading_child.merged_block
        if trailing_whitespace:
            head_children.append(trailing_whitespace)
        head_children.extend(context.iter_layout_block_tei_children(
            layout_block=child_block,
            enable_coordinates=False
        ))
        trailing_whitespace = child_block.whitespace
    return TEI_E('head', *head_children)
def set_abstract_layout_block(self, abstract_block: LayoutBlock):
    """Set the abstract under ``teiHeader/profileDesc/abstract`` from a layout block.

    The abstract is a single ``p`` element populated with the children
    produced by ``iter_layout_block_tei_children``.
    """
    abstract_paragraph = TEI_E(
        'p',
        *iter_layout_block_tei_children(abstract_block)
    )
    self.set_child_element_at(
        ['teiHeader', 'profileDesc', 'abstract'],
        abstract_paragraph
    )
def create_section(self) -> TeiSection:
    """Return a new ``TeiSection`` wrapping a fresh, empty ``div`` element."""
    div_element = TEI_E('div')
    return TeiSection(div_element)
def create_paragraph(self) -> TeiSectionParagraph:
    """Return a new ``TeiSectionParagraph`` wrapping a fresh, empty ``p`` element."""
    paragraph_element = TEI_E('p')
    return TeiSectionParagraph(paragraph_element)
def set_title(self, title: str):
    """Set the main title under ``teiHeader/fileDesc/titleStmt`` from plain text.

    The ``title`` element gets ``level="a"`` and ``type="main"``.
    """
    title_element = TEI_E('title', title, level="a", type="main")
    self.set_child_element_at(
        ['teiHeader', 'fileDesc', 'titleStmt'],
        title_element
    )
def set_abstract(self, abstract: str):
    """Set the abstract under ``teiHeader/profileDesc/abstract`` from plain text.

    The text is wrapped in a single ``p`` element.
    """
    abstract_paragraph = TEI_E('p', abstract)
    self.set_child_element_at(
        ['teiHeader', 'profileDesc', 'abstract'],
        abstract_paragraph
    )