def test_should_include_layout_document_text_in_tei_output(self):
    """The text of a one-block layout document ends up in the single FIGURE_XPATH node."""
    single_block = LayoutBlock.for_text(TEXT_1)
    document = LayoutDocument.for_blocks([single_block])
    tei_root = get_training_tei_xml_for_layout_document(document)
    figure_nodes = tei_xpath(tei_root, FIGURE_XPATH)
    assert len(figure_nodes) == 1
    # Trailing whitespace is stripped before comparing.
    figure_text = get_text_content(figure_nodes[0])
    assert figure_text.rstrip() == TEXT_1
def test_should_include_layout_document_text_in_tei_output(self):
    """The generated root is a TEI element and the single AUTHOR_XPATH node holds the text."""
    single_block = LayoutBlock.for_text(TEXT_1)
    document = LayoutDocument.for_blocks([single_block])
    tei_root = get_training_tei_xml_for_layout_document(document)
    # The root tag is the TEI namespace prefix followed by the local name.
    assert tei_root.tag == f'{TEI_NS_PREFIX}TEI'
    author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
    assert len(author_nodes) == 1
    # Trailing whitespace is stripped before comparing.
    author_text = get_text_content(author_nodes[0])
    assert author_text.rstrip() == TEXT_1
def get_post_processed_xml_root(self, xml_root: etree.ElementBase):
    """Set the `type` attribute on `idno` elements whose identifier type is detected.

    Every element matched by `//tei:idno` has its text content passed to
    `get_detected_external_identifier_type_for_text`. A truthy result is
    written to the element's `type` attribute; a falsy result leaves the
    element unchanged.

    Returns the same `xml_root` object that was passed in (modified in place).
    """
    for idno_node in tei_xpath(xml_root, '//tei:idno'):
        idno_text = get_text_content(idno_node)
        detected_type = get_detected_external_identifier_type_for_text(idno_text)
        if detected_type:
            idno_node.attrib['type'] = detected_type
    return xml_root
def test_should_keep_original_whitespace(self):
    """The spacing around the commas of the input line is reproduced in the AUTHOR_XPATH node."""
    original_text = 'Token1, Token2 ,Token3'
    line = LayoutLine.for_text(original_text, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
    tei_root = get_training_tei_xml_for_layout_document(document)
    author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
    assert len(author_nodes) == 1
    # Only trailing whitespace is stripped; inner spacing must match exactly.
    author_text = get_text_content(author_nodes[0])
    assert author_text.rstrip() == original_text
def test_should_include_layout_document_text_in_tei_output(self):
    """The segmentation generator puts the document text into a single `./text` node."""
    generator = SegmentationTeiTrainingDataGenerator()
    document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data_list)
    matched_nodes = tei_root.xpath('./text')
    assert len(matched_nodes) == 1
    # Trailing whitespace is stripped before comparing.
    node_text = get_text_content(matched_nodes[0])
    assert node_text.rstrip() == TEXT_1
def load_xml_lookup_from_file(
    filename: str
) -> TextLookUp:
    """Build a text lookup from the `tei:cell` nodes of an XML file.

    The file is parsed with `etree.parse`, the text content of every node
    matching `//tei:cell` is collected into a set (so duplicates collapse),
    the set is logged at debug level and then wrapped in a `SimpleTextLookUp`.
    """
    document = etree.parse(filename)
    cell_nodes = document.xpath('//tei:cell', namespaces=TEI_NS_MAP)
    valid_texts = set()
    for cell_node in cell_nodes:
        valid_texts.add(get_text_content(cell_node))
    LOGGER.debug('valid_texts: %s', valid_texts)
    return SimpleTextLookUp(valid_texts)
def test_should_include_layout_document_text_in_tei_output(self):
    """The document text appears in the single node matched by `./text/listBibl`."""
    generator = get_tei_training_data_generator()
    document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data_list)
    # Serialised XML is logged to help diagnose a failing assertion.
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    matched_nodes = tei_xpath(tei_root, './text/listBibl')
    assert len(matched_nodes) == 1
    node_text = get_text_content(matched_nodes[0])
    assert node_text.rstrip() == TEXT_1
def test_should_include_layout_document_text_in_tei_output(self):
    """The affiliation generator puts the document text into a single AFFILIATION_XPATH node."""
    generator = AffiliationAddressTeiTrainingDataGenerator()
    document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data_list)
    # Serialised XML is logged to help diagnose a failing assertion.
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    affiliation_text = get_text_content(affiliation_nodes[0])
    assert affiliation_text.rstrip() == TEXT_1
def test_should_add_line_feeds(self):
    """Two lines in one block come out joined by a line feed in the AUTHOR_XPATH node."""
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    tei_root = get_training_tei_xml_for_layout_document(document)
    author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
    assert len(author_nodes) == 1
    # The trailing line feed of the last line is stripped before comparing.
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(author_nodes[0]).rstrip() == expected_text
def test_should_keep_original_whitespace(self):
    """The spacing around the commas of the input line is reproduced under `./text/listBibl`."""
    generator = get_tei_training_data_generator()
    original_text = 'Token1, Token2 ,Token3'
    line = LayoutLine.for_text(original_text, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data_list)
    matched_nodes = tei_root.xpath('./text/listBibl')
    assert len(matched_nodes) == 1
    # Only trailing whitespace is stripped; inner spacing must match exactly.
    node_text = get_text_content(matched_nodes[0])
    assert node_text.rstrip() == original_text
def test_should_keep_original_whitespace(self):
    """The spacing around the commas of the input line is reproduced in the affiliation node."""
    generator = AffiliationAddressTeiTrainingDataGenerator()
    original_text = 'Token1, Token2 ,Token3'
    line = LayoutLine.for_text(original_text, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data_list)
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    # Only trailing whitespace is stripped; inner spacing must match exactly.
    affiliation_text = get_text_content(affiliation_nodes[0])
    assert affiliation_text.rstrip() == original_text
def test_should_map_unknown_label_to_note(self):
    """A line labelled `<unknown>` is emitted as a `note` element with `type="unknown"`."""
    unknown_line = get_next_layout_line_for_text(TEXT_1)
    labeled_lines = [('<unknown>', unknown_line)]
    labeled_model_data_list = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        labeled_model_data_list
    )
    # Serialised XML is logged to help diagnose a failing assertion.
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    affiliation_node = affiliation_nodes[0]
    note_texts = get_tei_xpath_text_content_list(
        affiliation_node, './tei:note[@type="unknown"]'
    )
    assert note_texts == [TEXT_1]
    # The full affiliation text is compared unstripped, including the final line feed.
    assert get_text_content(affiliation_node) == f'{TEXT_1}\n'
def test_should_add_line_feeds(self):
    """Two lines in one block come out joined by a line feed in the affiliation node."""
    generator = AffiliationAddressTeiTrainingDataGenerator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data_list)
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    # The trailing line feed of the last line is stripped before comparing.
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(affiliation_nodes[0]).rstrip() == expected_text
def test_should_add_line_feeds(self):
    """Two lines in one block come out joined by a line feed under `./text/listBibl`."""
    generator = get_tei_training_data_generator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data_list)
    matched_nodes = tei_root.xpath('./text/listBibl')
    assert len(matched_nodes) == 1
    # The trailing line feed of the last line is stripped before comparing.
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(matched_nodes[0]).rstrip() == expected_text
def test_should_not_join_separate_labels(self):
    """Two separately labelled `<institution>` lines produce two `orgName` elements."""
    first_line = get_next_layout_line_for_text(TEXT_1)
    second_line = get_next_layout_line_for_text(TEXT_2)
    labeled_lines = [
        ('<institution>', first_line),
        ('<institution>', second_line)
    ]
    labeled_model_data_list = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        labeled_model_data_list
    )
    # Serialised XML is logged to help diagnose a failing assertion.
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    affiliation_node = affiliation_nodes[0]
    institution_texts = get_tei_xpath_text_content_list(
        affiliation_node, './tei:orgName[@type="institution"]'
    )
    # Each labelled line keeps its own element rather than being merged.
    assert institution_texts == [TEXT_1, TEXT_2]
    assert get_text_content(affiliation_node) == f'{TEXT_1}\n{TEXT_2}\n'
def _get_text(xml, xpath: str):
    """Return the text of the item that `_get_item` selects for `xpath` in `xml`.

    The item's text content is returned when `get_text_content` succeeds.
    If that call raises `AttributeError`, `str(item)` is returned instead.
    """
    selected = _get_item(xml, xpath)
    try:
        text = get_text_content(selected)
    except AttributeError:
        text = str(selected)
    return text
def get_tei_xpath_text_content_list(parent: etree.ElementBase, xpath: str) -> List[str]:
    """Return the text content of each node that `tei_xpath` yields for `xpath` under `parent`.

    The result is a list with one entry per node, in the order the nodes
    are returned by `tei_xpath`.
    """
    matched_nodes = tei_xpath(parent, xpath)
    return list(map(get_text_content, matched_nodes))
def test_should_return_text_of_simple_element(self):
    """An element built with only direct text yields exactly that text."""
    element = E.parent('text 1')
    assert get_text_content(element) == 'text 1'