def test_should_include_layout_document_text_in_tei_output(self):
    """The block text must end up inside the single ``./text`` element."""
    generator = SegmentationTeiTrainingDataGenerator()
    document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    matching_nodes = tei_root.xpath('./text')
    assert len(matching_nodes) == 1
    # Trailing whitespace is stripped before comparing.
    actual_text = get_text_content(matching_nodes[0]).rstrip()
    assert actual_text == TEXT_1
def test_should_include_layout_document_text_in_tei_output(self):
    """The block text must end up inside the single ``./text/listBibl`` node."""
    generator = get_tei_training_data_generator()
    document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    # Log the serialized XML to help diagnose failures.
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    matching_nodes = tei_xpath(tei_root, './text/listBibl')
    assert len(matching_nodes) == 1
    # Trailing whitespace is stripped before comparing.
    actual_text = get_text_content(matching_nodes[0]).rstrip()
    assert actual_text == TEXT_1
def test_should_include_layout_document_text_in_tei_output(self):
    """The block text must end up inside the single affiliation node."""
    generator = AffiliationAddressTeiTrainingDataGenerator()
    document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    # Log the serialized XML to help diagnose failures.
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    # Trailing whitespace is stripped before comparing.
    actual_text = get_text_content(affiliation_nodes[0]).rstrip()
    assert actual_text == TEXT_1
def test_should_keep_original_whitespace(self):
    """Spacing around tokens in the line text must be kept in the TEI output."""
    generator = get_tei_training_data_generator()
    text = 'Token1, Token2 ,Token3'
    single_line = LayoutLine.for_text(text, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([LayoutBlock(lines=[single_line])])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    matching_nodes = tei_root.xpath('./text/listBibl')
    assert len(matching_nodes) == 1
    # Only trailing whitespace is stripped; inner spacing must match exactly.
    actual_text = get_text_content(matching_nodes[0]).rstrip()
    assert actual_text == text
def test_should_keep_original_whitespace(self):
    """Spacing around tokens in the line text must be kept in the TEI output."""
    generator = AffiliationAddressTeiTrainingDataGenerator()
    text = 'Token1, Token2 ,Token3'
    single_line = LayoutLine.for_text(text, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([LayoutBlock(lines=[single_line])])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    # Only trailing whitespace is stripped; inner spacing must match exactly.
    actual_text = get_text_content(affiliation_nodes[0]).rstrip()
    assert actual_text == text
def test_should_add_line_feeds(self):
    """Two layout lines must be separated by a line feed in the TEI text."""
    generator = get_tei_training_data_generator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    matching_nodes = tei_root.xpath('./text/listBibl')
    assert len(matching_nodes) == 1
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    # The trailing line feed of the last line is stripped before comparing.
    actual_text = get_text_content(matching_nodes[0]).rstrip()
    assert actual_text == expected_text
def test_should_add_line_feeds(self):
    """Two layout lines must be separated by a line feed in the TEI text."""
    generator = AffiliationAddressTeiTrainingDataGenerator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    # The trailing line feed of the last line is stripped before comparing.
    actual_text = get_text_content(affiliation_nodes[0]).rstrip()
    assert actual_text == expected_text
def test_should_lb_elements_before_line_feeds(self):
    """Each line feed must be preceded by an ``lb`` element in the TEI."""
    generator = get_tei_training_data_generator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    matching_nodes = tei_root.xpath('./text/listBibl')
    assert len(matching_nodes) == 1
    line_breaks = matching_nodes[0].xpath('lb')
    assert len(line_breaks) == 2
    first_break = line_breaks[0]
    # The first line's text sits before the first lb, as the parent's text.
    assert first_break.getparent().text == TEXT_1
    # The line feed and the second line's text follow the lb as its tail.
    assert first_break.tail == '\n' + TEXT_2
def test_should_lb_elements_before_line_feeds(self):
    """Each line feed must be preceded by a ``tei:lb`` element in the TEI."""
    generator = AffiliationAddressTeiTrainingDataGenerator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    line_breaks = tei_xpath(affiliation_nodes[0], 'tei:lb')
    assert len(line_breaks) == 2
    first_break = line_breaks[0]
    # The first line's text sits before the first lb, as the parent's text.
    assert first_break.getparent().text == TEXT_1
    # The line feed and the second line's text follow the lb as its tail.
    assert first_break.tail == '\n' + TEXT_2
def get_training_tei_xml_for_layout_document(
    layout_document: LayoutDocument
) -> etree.ElementBase:
    """Return the training TEI XML generated for ``layout_document``.

    The document is first converted to model data via
    ``get_model_data_list_for_layout_document`` (using the data generator
    from ``get_data_generator()``), and the result is passed on to
    ``get_training_tei_xml_for_model_data_iterable``.
    """
    model_data_list = get_model_data_list_for_layout_document(
        layout_document,
        data_generator=get_data_generator()
    )
    return get_training_tei_xml_for_model_data_iterable(model_data_list)