def test_should_filter_by_token_label(self):
     """Filtering by one label yields the text of the tokens tagged with it.

     Tokens for both tags are placed on a single shared line, and every
     label carries a reference to its individual token.
     """
     tokens_by_tag = [
         (TAG_1, get_layout_tokens_for_text('this is line 1')),
         (TAG_2, get_layout_tokens_for_text('this is line 2')),
     ]

     # Flatten the tokens of every tag into the one line they all share.
     all_tokens = []
     for _, tag_tokens in tokens_by_tag:
         all_tokens.extend(tag_tokens)
     shared_line = LayoutLine(all_tokens)

     # One label per token, each pointing at the shared line and its token.
     labels = []
     for tag, tag_tokens in tokens_by_tag:
         for token in tag_tokens:
             labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=shared_line,
                                  layout_token=token))

     block = LayoutBlock(lines=[shared_line])
     document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
     label_result = LayoutDocumentLabelResult(document, labels)

     for tag, expected_tokens in tokens_by_tag:
         filtered_document = label_result.get_filtered_document_by_label(tag)
         actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
         expected_text = join_layout_tokens(expected_tokens)
         assert actual_text == expected_text
 def test_should_filter_by_line_without_token(self):
     """Filtering by label works when labels reference only a line.

     Every label is created with ``layout_token=None``; the filtered
     document is expected to contain the tokens of the labelled line.
     """
     lines_by_tag = [
         (TAG_1, LayoutLine.for_text('this is line 1')),
         (TAG_2, LayoutLine.for_text('this is line 2')),
     ]

     labels = []
     for tag, tagged_line in lines_by_tag:
         # One label is emitted per token of the line, but the token itself
         # is not attached to the label.
         # NOTE(review): the per-token repetition yields identical labels;
         # presumably intentional - confirm before simplifying.
         for _ in tagged_line.tokens:
             labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=tagged_line.text,
                                  layout_line=tagged_line,
                                  layout_token=None))

     all_lines = [tagged_line for _, tagged_line in lines_by_tag]
     block = LayoutBlock(lines=all_lines)
     document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
     label_result = LayoutDocumentLabelResult(document, labels)

     for tag, tagged_line in lines_by_tag:
         filtered_document = label_result.get_filtered_document_by_label(tag)
         actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
         expected_text = join_layout_tokens(tagged_line.tokens)
         assert actual_text == expected_text
 def test_should_filter_by_token_multiple_labels(self):
     """Filtering by several labels yields the tokens of all those labels.

     Three tags share a single line; filtering by the first and third tag
     is expected to return their tokens and omit those of the second tag.
     """
     tokens_by_tag = [
         (TAG_1, get_layout_tokens_for_text('tokens tag 1')),
         (TAG_2, get_layout_tokens_for_text('tokens tag 2')),
         (TAG_3, get_layout_tokens_for_text('tokens tag 3')),
     ]

     # Flatten the tokens of every tag into the one line they all share.
     all_tokens = []
     for _, tag_tokens in tokens_by_tag:
         all_tokens.extend(tag_tokens)
     shared_line = LayoutLine(all_tokens)

     # One label per token, each pointing at the shared line and its token.
     labels = []
     for tag, tag_tokens in tokens_by_tag:
         for token in tag_tokens:
             labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=shared_line,
                                  layout_token=token))

     block = LayoutBlock(lines=[shared_line])
     document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
     label_result = LayoutDocumentLabelResult(document, labels)

     filtered_document = label_result.get_filtered_document_by_labels(
         [TAG_1, TAG_3])
     actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
     expected_text = join_layout_tokens(tokens_by_tag[0][1] +
                                        tokens_by_tag[2][1])
     assert actual_text == expected_text
 def test_should_return_remove_abstract_colon_prefix(self):
     """A leading ``'Abstract: '`` is absent from the cleaned block's first line."""
     block_with_prefix = LayoutBlock.for_text('Abstract: ' + ABSTRACT_1)
     cleaned_block = get_cleaned_abstract_layout_block(block_with_prefix)
     first_line_tokens = cleaned_block.lines[0].tokens
     assert join_layout_tokens(first_line_tokens) == ABSTRACT_1
 def test_should_return_abstract_if_it_doesnt_contain_prefix(self):
     """Text without an abstract prefix comes back unchanged on the first line."""
     block_without_prefix = LayoutBlock.for_text(ABSTRACT_1)
     cleaned_block = get_cleaned_abstract_layout_block(block_without_prefix)
     first_line_tokens = cleaned_block.lines[0].tokens
     assert join_layout_tokens(first_line_tokens) == ABSTRACT_1
Beispiel #6
0
 def write_xml_line_for_layout_tokens(self, xml_writer: XmlTreeWriter,
                                      layout_tokens: Iterable[LayoutToken]):
     """Write one line of tokens to the XML writer.

     The tokens are joined via ``join_layout_tokens`` and appended as text,
     followed by an ``lb`` element built with ``TEI_E``.

     Args:
         xml_writer: Writer that receives the text and the ``lb`` element.
         layout_tokens: Tokens whose joined text forms the line.
     """
     line_text = join_layout_tokens(layout_tokens)
     xml_writer.append_text(line_text)
     line_break = TEI_E('lb')
     xml_writer.append(line_break)