def test_should_preserve_empty_pages_if_requested(self):
     """Both pages must remain when ``preserve_empty_pages=True`` is passed.

     The second page only holds a line without tokens.
     """
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(
         pages=[page_with_token, page_without_tokens])
     cleaned_layout_document = remove_empty_blocks(
         layout_document, preserve_empty_pages=True)
     page_count = len(cleaned_layout_document.pages)
     assert page_count == 2
 def test_should_remove_empty_line_block_and_page(self):
     """Only the page carrying a token is expected after cleaning.

     The second page only holds a line without tokens and is expected to
     be gone from the cleaned document.
     """
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(
         pages=[page_with_token, page_without_tokens])
     cleaned_layout_document = remove_empty_blocks(layout_document)
     assert len(cleaned_layout_document.pages) == 1
     first_line = cleaned_layout_document.pages[0].blocks[0].lines[0]
     token_texts = [token.text for token in first_line.tokens]
     assert token_texts == ['token1']
 def test_should_not_return_section_label_if_block_is_empty(self):
     """A block without lines yields ``None`` as label and itself as title."""
     empty_heading_block = LayoutBlock(lines=[])
     label_and_title = get_section_label_and_title_from_layout_block(
         empty_heading_block)
     section_label_layout_block, section_title_layout_block = label_and_title
     assert section_label_layout_block is None
     assert section_title_layout_block == empty_heading_block
Example #4
0
 def test_should_provide_page_and_block_status_for_multi_line_blocks(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the page and block status of each line in a three-line block.

     The expected sequence is start / in / end for both statuses.
     """
     block_lines = [
         LayoutLine.for_text(line_text)
         for line_text in ('line1', 'line2', 'line3')
     ]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=block_lines)])])
     feature_values = [
         {
             'page_status': features.get_page_status(),
             'block_status': features.get_block_status()
         }
         for features in _iter_line_features(features_provider,
                                             layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_statuses = [
         ('PAGESTART', 'BLOCKSTART'),
         ('PAGEIN', 'BLOCKIN'),
         ('PAGEEND', 'BLOCKEND'),
     ]
     assert feature_values == [
         {'page_status': page_status, 'block_status': block_status}
         for page_status, block_status in expected_statuses
     ]
Example #5
0
 def test_should_provide_line_text(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check line text, first token text and second token text per line."""
     line_texts = [
         'first1 second1 this is a line',
         'first2 second2 this is a line',
     ]
     block_lines = [LayoutLine.for_text(text) for text in line_texts]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=block_lines)])])
     feature_values = [
         {
             'line_text': features.line_text,
             'token_text': features.token_text,
             'second_token_text': features.second_token_text
         }
         for features in _iter_line_features(features_provider,
                                             layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_rows = [
         ('first1 second1 this is a line', 'first1', 'second1'),
         ('first2 second2 this is a line', 'first2', 'second2'),
     ]
     assert feature_values == [
         {
             'line_text': line_text,
             'token_text': first_token_text,
             'second_token_text': second_token_text
         }
         for line_text, first_token_text, second_token_text in expected_rows
     ]
Example #6
0
 def test_should_provide_block_relative_line_length(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the block relative line length feature for lines of 1, 2, 10 chars."""
     block_lines = [
         LayoutLine.for_text(text) for text in ('1', '12', '1234567890')
     ]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=block_lines)])])
     feature_key = 'str_block_relative_line_length_feature'
     feature_values = [
         {feature_key: features.get_str_block_relative_line_length_feature()}
         for features in _iter_line_features(features_provider,
                                             layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_lengths = [
         '1',  # 1 * 10 / 10
         '2',  # 2 * 10 / 10
         '10',  # 10 * 10 / 10
     ]
     assert feature_values == [
         {feature_key: expected_length}
         for expected_length in expected_lengths
     ]
 def test_should_return_original_block_for_empty_block(self):
     """A block without lines comes back as-is, with empty prefix and suffix."""
     empty_block = LayoutBlock(lines=[])
     result = get_regex_cleaned_layout_block_with_prefix_suffix(
         empty_block, r'other')
     prefix_block, cleaned_block, suffix_block = result
     assert not prefix_block.lines
     assert cleaned_block == empty_block
     assert not suffix_block.lines
Example #8
0
 def get_filtered_document_by_labels(self, labels: List[str]):  # pylint: disable=too-many-branches
     """Return a new document containing only content labelled with `labels`.

     The labels are looked up via `get_layout_document_labels_by_labels`.
     If any of the matching labels references a layout token, filtering
     happens at token level (tokens are matched by object identity).
     Otherwise filtering falls back to whole lines referenced by the labels
     (also matched by object identity).

     Pages and blocks are only added to the result once they receive their
     first accepted line. A line whose tokens are all accepted is reused
     as-is; a partially accepted line is replaced by a new `LayoutLine`
     holding only the accepted tokens.

     Args:
         labels: the labels whose content should be kept.

     Returns:
         A new `LayoutDocument`; it has no pages if no labels matched.
     """
     layout_document = LayoutDocument(pages=[])
     layout_document_labels = self.get_layout_document_labels_by_labels(
         labels)
     if not layout_document_labels:
         LOGGER.warning(
             'no layout_lines_to_include found for: %r, available keys=%r',
             labels, self.layout_document_labels_by_label.keys())
         return layout_document
     layout_token_ids_to_include = {
         id(layout_document_label.layout_token)
         for layout_document_label in layout_document_labels
         if layout_document_label.layout_token
     }
     LOGGER.debug('layout_tokens_to_include: %s',
                  layout_token_ids_to_include)
     # Line level filtering is only used when no label references a token.
     layout_line_ids_to_include: Set[int] = set()
     if not layout_token_ids_to_include:
         layout_line_ids_to_include = {
             id(layout_document_label.layout_line)
             for layout_document_label in layout_document_labels
             if layout_document_label.layout_line
         }
     LOGGER.debug('layout_line_ids_to_include: %s',
                  layout_line_ids_to_include)
     for page in self.layout_document.pages:  # pylint: disable=too-many-nested-blocks
         # Result containers are created lazily, so pages and blocks
         # without any accepted line do not appear in the result.
         result_page: Optional[LayoutPage] = None
         for block in page.blocks:
             result_block: Optional[LayoutBlock] = None
             for line in block.lines:
                 accepted_line: Optional[LayoutLine] = None
                 if layout_token_ids_to_include:
                     accepted_tokens: List[LayoutToken] = [
                         token for token in line.tokens
                         if id(token) in layout_token_ids_to_include
                     ]
                     if not accepted_tokens:
                         continue
                     # Compare the counts: previously the token count was
                     # compared against the list itself, which is never
                     # equal, so complete lines were always rebuilt.
                     if len(line.tokens) == len(accepted_tokens):
                         accepted_line = line
                     else:
                         accepted_line = LayoutLine(tokens=accepted_tokens)
                 else:
                     if id(line) not in layout_line_ids_to_include:
                         continue
                     accepted_line = line
                 if result_page is None:
                     result_page = LayoutPage(blocks=[])
                     layout_document.pages.append(result_page)
                 if result_block is None:
                     result_block = LayoutBlock(lines=[])
                     result_page.blocks.append(result_block)
                 result_block.lines.append(accepted_line)
     return layout_document
 def test_should_keep_original_whitespace(self):
     """The single node matched by AUTHOR_XPATH must keep the input spacing."""
     text = 'Token1, Token2  ,Token3'
     single_line = LayoutLine.for_text(text, tail_whitespace='\n')
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=[single_line])])
     xml_root = get_training_tei_xml_for_layout_document(layout_document)
     author_nodes = tei_xpath(xml_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     author_text = get_text_content(author_nodes[0])
     assert author_text.rstrip() == text
 def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
         self):
     """Check replacement of page content by a semantic graphic.

     The page holds a "keep" and a "remove" token and graphic. Two semantic
     graphics are passed in, one of them with zero-sized coordinates.
     Expected result: the page graphics are the "keep" graphic followed by
     a graphic with the type and coordinates of the new graphic, the page
     block only retains the "keep" token, and the related block of the last
     graphic iterates over both the "keep" and the "remove" token.
     """
     def _coordinates_on_page_1(*bounding_box_args):
         return LayoutPageCoordinates.from_bounding_box(
             BoundingBox(*bounding_box_args), page_number=1)

     page_coordinates = _coordinates_on_page_1(0, 0, 200, 200)
     semantic_graphic_coordinates = _coordinates_on_page_1(10, 90, 100, 50)
     keep_coordinates = _coordinates_on_page_1(10, 10, 100, 20)
     remove_coordinates = _coordinates_on_page_1(10, 100, 100, 20)
     empty_coordinates = _coordinates_on_page_1(10, 100, 0, 0)

     keep_token = LayoutToken('keep', coordinates=keep_coordinates)
     remove_token = LayoutToken('remove', coordinates=remove_coordinates)
     keep_graphic = LayoutGraphic(coordinates=keep_coordinates,
                                  graphic_type='keep-graphic')
     remove_graphic = LayoutGraphic(coordinates=remove_coordinates,
                                    graphic_type='remove-graphic')

     text_block = LayoutBlock(
         lines=[LayoutLine(tokens=[keep_token, remove_token])])
     page_meta = LayoutPageMeta(page_number=page_coordinates.page_number,
                                coordinates=page_coordinates)
     page = LayoutPage(blocks=[text_block],
                       graphics=[keep_graphic, remove_graphic],
                       meta=page_meta)
     layout_document = LayoutDocument(pages=[page])

     layout_graphic = LayoutGraphic(
         coordinates=semantic_graphic_coordinates,
         graphic_type='new-graphic')
     no_coords_layout_graphic = LayoutGraphic(
         coordinates=empty_coordinates, graphic_type='empty-coords-graphic')
     semantic_graphics = [
         SemanticGraphic(layout_graphic=layout_graphic),
         SemanticGraphic(layout_graphic=no_coords_layout_graphic)
     ]
     result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
         layout_document, semantic_graphics=semantic_graphics)

     result_page = result.pages[0]
     LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
     assert result_page.graphics[:-1] == [keep_graphic]
     last_graphic = result_page.graphics[-1]
     LOGGER.debug('result.pages[0].graphics[-1]: %r', last_graphic)
     assert last_graphic.graphic_type == layout_graphic.graphic_type
     assert last_graphic.coordinates == layout_graphic.coordinates
     remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
     assert remaining_tokens == [keep_token]
     related_tokens = list(last_graphic.related_block.iter_all_tokens())
     assert related_tokens == [keep_token, remove_token]
 def test_should_add_line_feeds(self):
     """The node matched by AUTHOR_XPATH must join both lines with a line feed."""
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     xml_root = get_training_tei_xml_for_layout_document(layout_document)
     author_nodes = tei_xpath(xml_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     author_text = get_text_content(author_nodes[0])
     assert author_text.rstrip() == '\n'.join([TEXT_1, TEXT_2])
 def test_should_keep_original_whitespace(self):
     """The single ``./text/listBibl`` node must keep the input spacing."""
     training_data_generator = get_tei_training_data_generator()
     text = 'Token1, Token2  ,Token3'
     single_line = LayoutLine.for_text(text, tail_whitespace='\n')
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=[single_line])])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     list_bibl_nodes = xml_root.xpath('./text/listBibl')
     assert len(list_bibl_nodes) == 1
     list_bibl_text = get_text_content(list_bibl_nodes[0])
     assert list_bibl_text.rstrip() == text
Example #13
0
 def test_should_keep_original_whitespace(self):
     """The single node matched by AFFILIATION_XPATH must keep the input spacing."""
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     text = 'Token1, Token2  ,Token3'
     single_line = LayoutLine.for_text(text, tail_whitespace='\n')
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=[single_line])])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     affiliation_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation_text = get_text_content(affiliation_nodes[0])
     assert affiliation_text.rstrip() == text
 def test_should_lb_elements_before_line_feeds(self):
     """Each of the two lines gets a ``tei:lb`` element placed before its line feed.

     The first ``lb`` follows TEXT_1 in its parent and is itself followed by
     a line feed and TEXT_2.
     """
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     xml_root = get_training_tei_xml_for_layout_document(layout_document)
     author_nodes = tei_xpath(xml_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     lb_nodes = tei_xpath(author_nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
Example #15
0
 def test_should_add_line_feeds(self):
     """The node matched by AFFILIATION_XPATH must join both lines with a line feed."""
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     affiliation_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation_text = get_text_content(affiliation_nodes[0])
     assert affiliation_text.rstrip() == '\n'.join([TEXT_1, TEXT_2])
 def test_should_add_line_feeds(self):
     """The ``./text/listBibl`` node must join both lines with a line feed."""
     training_data_generator = get_tei_training_data_generator()
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     list_bibl_nodes = xml_root.xpath('./text/listBibl')
     assert len(list_bibl_nodes) == 1
     list_bibl_text = get_text_content(list_bibl_nodes[0])
     assert list_bibl_text.rstrip() == '\n'.join([TEXT_1, TEXT_2])
def get_labeled_model_data_list(
        label_and_layout_line_list: Sequence[Tuple[str, LayoutLine]],
        data_generator: ModelDataGenerator
) -> Sequence[LabeledLayoutModelData]:
    """Generate labeled model data for a list of (label, layout line) pairs.

    Each layout line is wrapped into its own single-block document, which is
    passed to `data_generator`. Every resulting model data item is labeled
    using `get_label_with_prefix`, with an index that restarts at zero for
    every line.

    Args:
        label_and_layout_line_list: pairs of label and the line to label.
        data_generator: produces the model data for each one-line document.

    Returns:
        The labeled model data of all lines, in input order.
    """
    result = []
    for label, layout_line in label_and_layout_line_list:
        single_line_document = LayoutDocument.for_blocks(
            [LayoutBlock(lines=[layout_line])])
        model_data_iterable = (
            data_generator.iter_model_data_for_layout_document(
                single_line_document))
        for index, model_data in enumerate(model_data_iterable):
            result.append(
                LabeledLayoutModelData.from_model_data(
                    model_data,
                    label=get_label_with_prefix(label, index=index)))
    return result
Example #18
0
 def test_should_lb_elements_before_line_feeds(self):
     """The affiliation node gets one ``tei:lb`` per line, before the line feed.

     The first ``lb`` follows TEXT_1 in its parent and is itself followed by
     a line feed and TEXT_2.
     """
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     affiliation_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     lb_nodes = tei_xpath(affiliation_nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
 def test_should_lb_elements_before_line_feeds(self):
     """The ``./text/listBibl`` node gets one ``lb`` per line, before the line feed.

     The first ``lb`` follows TEXT_1 in its parent and is itself followed by
     a line feed and TEXT_2.
     """
     training_data_generator = get_tei_training_data_generator()
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     list_bibl_nodes = xml_root.xpath('./text/listBibl')
     assert len(list_bibl_nodes) == 1
     lb_nodes = list_bibl_nodes[0].xpath('lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
 def test_should_generate_tei_from_model_data(self):
     """TEI generated from model data has one author node with two ``tei:lb``.

     The first ``lb`` follows TEXT_1 in its parent and is itself followed by
     a line feed and TEXT_2.
     """
     block_lines = [
         get_next_layout_line_for_text(line_text)
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     data_generator = get_data_generator()
     model_data_iterable = data_generator.iter_model_data_for_layout_document(
         layout_document)
     xml_root = get_training_tei_xml_for_model_data_iterable(
         model_data_iterable)
     author_nodes = tei_xpath(xml_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     lb_nodes = tei_xpath(author_nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
Example #21
0
 def test_should_generate_tei_from_model_data(self):
     """TEI generated from model data has one affiliation node with two ``tei:lb``.

     The first ``lb`` follows TEXT_1 in its parent and is itself followed by
     a line feed and TEXT_2.
     """
     block_lines = [
         get_next_layout_line_for_text(line_text)
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     data_generator = get_data_generator()
     model_data_iterable = data_generator.iter_model_data_for_layout_document(
         layout_document)
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_iterable))
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     affiliation_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     lb_nodes = tei_xpath(affiliation_nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
Example #22
0
 def test_should_filter_by_line_without_token(self):
     """Filtering by label works when labels reference lines but no tokens.

     One label is created per token of each line, each with
     ``layout_token=None``; filtering by a tag must yield that line's tokens.
     """
     tagged_lines = [(TAG_1, LayoutLine.for_text('this is line 1')),
                     (TAG_2, LayoutLine.for_text('this is line 2'))]
     layout_model_labels = []
     for tag, line in tagged_lines:
         # one label per token, although the token itself is not referenced
         for _ in line.tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=line.text,
                                  layout_line=line,
                                  layout_token=None))
     all_lines = [tagged_line for _, tagged_line in tagged_lines]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=all_lines)])])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)
     for tag, line in tagged_lines:
         filtered_document = (
             layout_document_label_result.get_filtered_document_by_label(tag))
         assert (join_layout_tokens(filtered_document.iter_all_tokens()) ==
                 join_layout_tokens(line.tokens))
Example #23
0
 def test_should_filter_by_token_label(self):
     """Filtering by a tag yields only that tag's tokens from a shared line."""
     tagged_tokens = [(TAG_1, get_layout_tokens_for_text('this is line 1')),
                      (TAG_2, get_layout_tokens_for_text('this is line 2'))]
     all_tokens = []
     for _, tag_tokens in tagged_tokens:
         all_tokens.extend(tag_tokens)
     line = LayoutLine(all_tokens)
     layout_model_labels = []
     for tag, tokens in tagged_tokens:
         for token in tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=line,
                                  layout_token=token))
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)
     for tag, tokens in tagged_tokens:
         filtered_document = (
             layout_document_label_result.get_filtered_document_by_label(tag))
         assert (join_layout_tokens(filtered_document.iter_all_tokens()) ==
                 join_layout_tokens(tokens))
Example #24
0
 def test_should_provide_block_relative_document_token_position(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the relative document position feature for ten lines.

     The expected value of line ``i`` is the string form of
     ``feature_linear_scaling_int(i, 10, NBBINS_POSITION)``.
     """
     block_lines = [
         LayoutLine.for_text(f'line{line_index}') for line_index in range(10)
     ]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=block_lines)])])
     feature_key = 'str_relative_document_position'
     feature_values = [
         {feature_key: features.get_str_relative_document_position()}
         for features in _iter_line_features(features_provider,
                                             layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_feature_values = [
         {
             feature_key:
             str(feature_linear_scaling_int(line_index, 10, NBBINS_POSITION))
         }
         for line_index in range(10)
     ]
     assert feature_values == expected_feature_values
 def test_should_generate_tei_from_model_data(self):
     """TEI generated from model data has one ``./text/listBibl`` with two ``lb``.

     The first ``lb`` follows TEXT_1 in its parent and is itself followed by
     a line feed and TEXT_2.
     """
     block_lines = [
         get_next_layout_line_for_text(line_text)
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     data_generator = get_data_generator()
     model_data_iterable = data_generator.iter_model_data_for_layout_document(
         layout_document)
     training_data_generator = get_tei_training_data_generator()
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_iterable))
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     list_bibl_nodes = xml_root.xpath('./text/listBibl')
     assert len(list_bibl_nodes) == 1
     lb_nodes = list_bibl_nodes[0].xpath('lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
 def test_should_generate_tei_from_model_data(self):
     """Segmentation TEI from model data has one ``./text`` node with two ``lb``.

     The data generator is created with ``use_first_token_of_block=True``.
     The first ``lb`` follows TEXT_1 in its parent and is itself followed by
     a line feed and TEXT_2.
     """
     block_lines = [
         get_next_layout_line_for_text(line_text)
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     data_generator = SegmentationDataGenerator(
         DEFAULT_DOCUMENT_FEATURES_CONTEXT, use_first_token_of_block=True)
     model_data_iterable = data_generator.iter_model_data_for_layout_document(
         layout_document)
     training_data_generator = SegmentationTeiTrainingDataGenerator()
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_iterable))
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     text_nodes = xml_root.xpath('./text')
     assert len(text_nodes) == 1
     lb_nodes = text_nodes[0].xpath('lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
Example #27
0
 def test_should_filter_by_token_multiple_labels(self):
     """Filtering by TAG_1 and TAG_3 yields their tokens, without those of TAG_2."""
     tagged_tokens = [(TAG_1, get_layout_tokens_for_text('tokens tag 1')),
                      (TAG_2, get_layout_tokens_for_text('tokens tag 2')),
                      (TAG_3, get_layout_tokens_for_text('tokens tag 3'))]
     all_tokens = []
     for _, tag_tokens in tagged_tokens:
         all_tokens.extend(tag_tokens)
     line = LayoutLine(all_tokens)
     layout_model_labels = []
     for tag, tokens in tagged_tokens:
         for token in tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=line,
                                  layout_token=token))
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)
     filtered_document = (
         layout_document_label_result.get_filtered_document_by_labels(
             [TAG_1, TAG_3]))
     actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
     expected_tokens = tagged_tokens[0][1] + tagged_tokens[2][1]
     assert actual_text == join_layout_tokens(expected_tokens)
Example #28
0
 def test_should_provide_punctuation_profile(
         self, features_provider: SegmentationLineFeaturesProvider):
     """The line ``'a .: b'`` has punctuation profile ``'.:'`` with length ``'2'``."""
     single_line = LayoutLine.for_text('a .: b')
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=[single_line])])])
     feature_values = [
         {
             'line_punctuation_profile':
             features.get_line_punctuation_profile(),
             'line_punctuation_profile_length_feature':
             features.get_line_punctuation_profile_length_feature(),
         }
         for features in _iter_line_features(features_provider,
                                             layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_feature_values = [{
         'line_punctuation_profile': '.:',
         'line_punctuation_profile_length_feature': '2'
     }]
     assert feature_values == expected_feature_values
 def add_content(self, block: LayoutBlock):
     """Replace `self.content` with a new block that also holds `block`'s lines.

     The new block's lines are the current content lines followed by the
     lines of `block`.
     """
     combined_lines = self.content.lines + block.lines
     self.content = LayoutBlock(lines=combined_lines)
Example #30
0
 def test_should_return_empty_str_if_passed_in_text_was_empty(self):
     """Cleaning a block without lines returns a block equal to the input."""
     empty_block = LayoutBlock(lines=[])
     cleaned_block = get_cleaned_abstract_layout_block(empty_block)
     assert cleaned_block == empty_block