def test_should_merge_single_token_and_add_prefix(self):
     """Merge a same-text token into ``SCOPE_1`` and check both scopes.

     After merging, the token text and default-scope tag are expected to be
     unchanged, while the other document's tag is readable via ``SCOPE_1``.
     """
     target_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])]
     )
     source_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_2)])]
     )
     merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
     target_doc.merge_with(source_doc, merge_fn)

     tokens = list(target_doc.iter_all_tokens())

     texts = [target_doc.get_text(token) for token in tokens]
     assert texts == [TEXT_1]

     # the default scope keeps the tag of the document merged into
     default_tags = [target_doc.get_tag(token) for token in tokens]
     assert default_tags == [TAG_1]

     # the other document's tag ends up under the target scope
     scoped_tags = [
         target_doc.get_tag(token, scope=SCOPE_1) for token in tokens
     ]
     assert scoped_tags == [TAG_2]
 def test_should_return_ratio_and_count_of_tagged_tokens(self):
     """Expect per-tag counts and fractions for tagged and untagged tokens.

     Untagged tokens are expected to be reported under the ``None`` key.
     """
     tagged = [SimpleToken(text) for text in ('this', 'is', 'tagged')]
     untagged = [SimpleToken(text) for text in ('this', 'isn\'t')]
     doc = SimpleStructuredDocument(lines=[SimpleLine(tagged + untagged)])
     for token in tagged:
         doc.set_tag(token, TAG1)

     tagged_count = len(tagged)
     untagged_count = len(untagged)
     total_count = tagged_count + untagged_count
     expected = [{
         'count': {
             TAG1: tagged_count,
             None: untagged_count
         },
         'percentage': {
             TAG1: tagged_count / total_count,
             None: untagged_count / total_count
         }
     }]
     assert evaluate_document_by_page(doc) == expected
    def test_should_merge_doc_and_scope_cv_tag(
            self, load_structured_document_mock,
            structured_document_to_token_props_mock):
        """Expect the document passed on to carry both default and CV tags.

        The two mock parameters are presumably injected by patch decorators
        or fixtures that are not visible here.
        """
        main_doc = SimpleStructuredDocument(lines=[
            SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])
        ])
        cv_token = SimpleToken(TEXT_1, tag=TAG_2, tag_scope=CV_TAG_SCOPE)
        cv_doc = SimpleStructuredDocument(lines=[SimpleLine([cv_token])])
        # first load call yields the main document, the second the CV one
        load_structured_document_mock.side_effect = [main_doc, cv_doc]

        load_and_convert_to_token_props(
            FILE_1,
            FILE_2,
            cv_source_tag_scope=CV_TAG_SCOPE,
            page_range=PAGE_RANGE
        )

        load_structured_document_mock.assert_any_call(
            FILE_1, page_range=PAGE_RANGE
        )
        # first positional argument of the most recent call
        positional_args = structured_document_to_token_props_mock.call_args[0]
        passed_doc = positional_args[0]
        tags_by_scope = [
            passed_doc.get_tag_by_scope(token)
            for token in passed_doc.iter_all_tokens()
        ]
        assert tags_by_scope == [{None: TAG_1, CV_TAG_SCOPE: TAG_2}]
Esempio n. 4
0
 def test_should_raise_assertion_error_if_tokens_mismatch(self):
     """Expect ``AssertionError`` when merging documents with differing text."""
     target_doc = SimpleStructuredDocument(lines=[
         SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])
     ])
     # same structure, but the token text differs (TEXT_2 vs. TEXT_1)
     source_doc = SimpleStructuredDocument(lines=[
         SimpleLine([SimpleToken(TEXT_2, tag=TAG_2)])
     ])
     with pytest.raises(AssertionError):
         merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
         target_doc.merge_with(source_doc, merge_fn)
def _token_with_sub_tag(text,
                        tag=None,
                        tag_prefix=None,
                        sub_tag=None,
                        sub_tag_prefix=None):
    """Create a ``SimpleToken`` and optionally set a level 2 tag on it.

    Args:
        text: passed to ``SimpleToken`` as its first argument.
        tag: passed to ``SimpleToken`` as ``tag``.
        tag_prefix: passed to ``SimpleToken`` as ``tag_prefix``.
        sub_tag: if truthy, set on the token via ``set_tag`` with ``level=2``.
        sub_tag_prefix: prefix passed along when ``sub_tag`` is set.

    Returns:
        The created token.
    """
    result = SimpleToken(text, tag=tag, tag_prefix=tag_prefix)
    if not sub_tag:
        return result
    result.set_tag(sub_tag, prefix=sub_tag_prefix, level=2)
    return result
Esempio n. 6
0
 def test_should_strip_prefix(self):
     """Expect counts keyed by the bare tag, although tokens carry prefixes."""
     texts_and_prefixes = (
         ('this', B_TAG_PREFIX),
         ('is', I_TAG_PREFIX),
         ('tagged', I_TAG_PREFIX)
     )
     tokens = [
         SimpleToken(text, tag=TAG1, tag_prefix=prefix)
         for text, prefix in texts_and_prefixes
     ]
     doc = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
     first_result = evaluate_document_by_page(doc)[0]
     counted_tags = set(first_result['count'].keys())
     assert counted_tags == {TAG1}
Esempio n. 7
0
 def test_should_not_override_with_empty_tags(self):
     """Expect the existing tag to survive a merge with an untagged token."""
     target_doc = SimpleStructuredDocument(lines=[
         SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])
     ])
     # the other document has the same text but no tag at all
     untagged_doc = SimpleStructuredDocument(lines=[
         SimpleLine([SimpleToken(TEXT_1)])
     ])
     target_doc.merge_with(untagged_doc, partial(merge_token_tag))
     tags = [
         target_doc.get_tag(token)
         for token in list(target_doc.iter_all_tokens())
     ]
     assert tags == [TAG_1]
 def test_should_match_normalised_characters(self):
     """Expect tokens with thin space / en dash / em dash to match plain text.

     The target annotation spells the same phrase as ``'this is -- matching'``.
     """
     special_suffix = THIN_SPACE + EN_DASH + EM_DASH
     tokens = [
         SimpleToken('this'),
         SimpleToken('is' + special_suffix),
         SimpleToken('matching')
     ]
     annotations = [TargetAnnotation('this is -- matching', TAG1)]
     doc = _document_for_tokens([tokens])
     annotator = MatchingAnnotator(annotations)
     annotator.annotate(doc)
     expected_tags = [TAG1] * len(tokens)
     assert _get_tags_of_tokens(tokens) == expected_tags
 def test_should_return_multiple_token_texts(self):
     """Expect one token-props entry per token with its text, in order."""
     line = SimpleLine([
         SimpleToken(text) for text in (TEXT_1, TEXT_2, TEXT_3)
     ])
     page = SimplePage([line], bounding_box=PAGE_BOUNDING_BOX)
     doc = SimpleStructuredDocument(page)
     token_props_list = list(structured_document_to_token_props(doc))
     actual_texts = [props.get('text') for props in token_props_list]
     assert actual_texts == [TEXT_1, TEXT_2, TEXT_3]
Esempio n. 10
0
 def test_should_not_fail_with_absent_tags(self):
     """Expect a ``None`` scoped tag after merging with an untagged token."""
     target_doc = SimpleStructuredDocument(lines=[
         SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])
     ])
     untagged_doc = SimpleStructuredDocument(lines=[
         SimpleLine([SimpleToken(TEXT_1)])
     ])
     merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
     target_doc.merge_with(untagged_doc, merge_fn)

     tokens = list(target_doc.iter_all_tokens())
     scoped_tags = [
         target_doc.get_tag(token, scope=SCOPE_1) for token in tokens
     ]
     # nothing to merge, so the target scope stays empty
     assert scoped_tags == [None]
Esempio n. 11
0
 def test_should_return_tag(self):
     """Expect each token-props entry to expose the token's tag (or None)."""
     first_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     second_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_3, tag=TAG_3)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     doc = SimpleStructuredDocument([first_page, second_page])
     token_props_list = list(structured_document_to_token_props(doc))
     tags = [props.get('tag') for props in token_props_list]
     # the second token was created without a tag
     assert tags == [TAG_1, None, TAG_3]
 def test_should_merge_from_cv_tag_scope(self):
     """Expect the CV document's scoped tag to be merged under the same scope.

     The default-scope tag of the main document is expected to be kept.
     """
     main_token = SimpleToken(TEXT_1, tag_scope=None, tag=TAG_1)
     cv_token = SimpleToken(TEXT_1, tag_scope=CV_TAG_SCOPE, tag=TAG_2)
     main_doc = SimpleStructuredDocument(lines=[SimpleLine([main_token])])
     cv_doc = SimpleStructuredDocument(lines=[SimpleLine([cv_token])])

     merged_doc = merge_with_cv_structured_document(
         main_doc,
         cv_doc,
         cv_source_tag_scope=CV_TAG_SCOPE
     )

     default_tags = get_all_token_tags(merged_doc)
     assert default_tags == [TAG_1]
     cv_tags = get_all_token_tags(merged_doc, scope=CV_TAG_SCOPE)
     assert cv_tags == [TAG_2]
    def test_should_strip_tag_prefix(self):
        """Expect annotation blocks to carry the tag without its prefix."""
        prefixed_token = SimpleToken(
            'test',
            tag=TAG1,
            tag_prefix=B_TAG_PREFIX,
            bounding_box=DEFAULT_BOUNDING_BOX
        )
        # sanity check: the token's own tag still includes the prefix
        assert prefixed_token.get_tag() == B_TAG_PREFIX + TAG1

        doc = SimpleStructuredDocument(lines=[SimpleLine([prefixed_token])])
        first_page = doc.get_pages()[0]
        blocks = annotation_document_page_to_annotation_blocks(
            doc, first_page
        )
        block_tags = [block.tag for block in blocks]
        assert block_tags == [TAG1]
Esempio n. 14
0
 def test_should_return_line_token_index_and_page_count(self):
     """Expect ``line_token`` props with per-line token index and count."""
     two_token_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1), SimpleToken(TEXT_2)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     one_token_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_3)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     doc = SimpleStructuredDocument([two_token_page, one_token_page])
     token_props_list = list(structured_document_to_token_props(doc))
     line_token_props = [
         props.get('line_token') for props in token_props_list
     ]
     indices = [entry.get('index') for entry in line_token_props]
     assert indices == [0, 1, 0]
     counts = [entry.get('count') for entry in line_token_props]
     assert counts == [2, 2, 1]
 def test_should_return_scoped_tags(self):
     """Expect ``scoped_tags`` to contain only tags set with a scope."""
     first_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     scoped_token = SimpleToken(TEXT_3, tag=TAG_3, tag_scope=SCOPE_1)
     second_page = SimplePage(
         [SimpleLine([scoped_token])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     doc = SimpleStructuredDocument([first_page, second_page])
     token_props_list = list(structured_document_to_token_props(doc))
     scoped_tags = [props.get('scoped_tags') for props in token_props_list]
     # default-scope tag and missing tag both yield an empty mapping
     assert scoped_tags == [{}, {}, {SCOPE_1: TAG_3}]
Esempio n. 16
0
 def test_should_not_return_line_number_tokens_if_not_line(self):
     """Expect no line numbers if another token sits at a smaller x.

     Each line holds a number token at x=30 and an 'other text' token at
     x=20 with the same y.
     """
     number_tokens = [
         SimpleToken(str(line_no), {'x': str(30), 'y': str(line_no * 20)})
         for line_no in range(1, 5)
     ]
     lines = []
     for number_token in number_tokens:
         text_token = SimpleToken(
             'other text',
             {'x': str(20), 'y': number_token.get_y()}
         )
         lines.append(SimpleLine([number_token, text_token]))
     doc = SimpleStructuredDocument(lines=lines)

     found_tokens = list(find_line_number_tokens(doc))
     assert found_tokens == []
 def test_should_annotate_ignoring_space_after_dot_short_sequence(self):
     """Expect token ``'A.B.,'`` to be tagged for target value ``'A. B.'``."""
     tokens = [SimpleToken('A.B.,')]
     annotations = [TargetAnnotation('A. B.', TAG1)]
     doc = _document_for_tokens([tokens])
     annotator = MatchingAnnotator(annotations)
     annotator.annotate(doc)
     expected_tags = [TAG1] * len(tokens)
     assert _get_tags_of_tokens(tokens) == expected_tags
Esempio n. 18
0
 def test_should_tag_single_token_within_partial_prediction_at_smaller_scale(self):
     """Expect the token to get the CV tag from a smaller predicted image.

     The image is a tenth of the page size in each dimension, so the token's
     box presumably maps onto the rectangle filled with ``COLOR_1``.
     """
     token = SimpleToken(TOKEN_TEXT_1)
     doc = SimpleStructuredDocument(lines=[SimpleLine([token])])
     page = doc.get_pages()[0]
     page_box = BoundingBox(0, 0, DEFAULT_WIDTH * 100, DEFAULT_HEIGHT * 100)
     token_box = BoundingBox(0, 0, DEFAULT_WIDTH * 10, DEFAULT_HEIGHT * 10)
     doc.set_bounding_box(page, page_box)
     doc.set_bounding_box(token, token_box)

     predicted_image = filled_image(
         BG_COLOR,
         {TAG_1: COLOR_1},
         width=DEFAULT_WIDTH * 10,
         height=DEFAULT_HEIGHT * 10
     )
     # only the top-left corner of the image is painted in the tag's colour
     tagged_area = BoundingBox(0, 0, DEFAULT_WIDTH, DEFAULT_HEIGHT)
     fill_rect(predicted_image, tagged_area, COLOR_1)

     annotate_structured_document_using_predicted_images(
         doc, [predicted_image]
     )
     assert doc.get_tag(token, scope=CV_TAG_SCOPE) == TAG_1
Esempio n. 19
0
 def test_should_return_all_tag_by_scope(self):
     """Expect ``get_tag_by_scope`` to map every scope to the tag set on it."""
     the_token = SimpleToken(TEXT_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([the_token])])
     # one tag in the default (None) scope, one in an explicit scope
     document.set_tag(the_token, TAG_1)
     document.set_tag(the_token, TAG_2, scope=SCOPE_1)

     assert document.get_tag(the_token) == TAG_1
     assert document.get_tag(the_token, scope=SCOPE_1) == TAG_2
     expected_by_scope = {None: TAG_1, SCOPE_1: TAG_2}
     assert document.get_tag_by_scope(the_token) == expected_by_scope
 def test_should_not_tag_using_none_tag(self):
     """Expect no CRF-scoped tag when the prediction is ``NONE_TAG``."""
     token = SimpleToken(TOKEN_TEXT_1)
     doc = SimpleStructuredDocument(lines=[SimpleLine([token])])
     predictions = [NONE_TAG]
     annotate_structured_document_using_predictions(doc, predictions)
     crf_tag = doc.get_tag(token, scope=CRF_TAG_SCOPE)
     assert crf_tag is None
 def test_should_extract_from_different_tag_scope(self):
     """Expect extraction to read tags from the requested tag scope."""
     scoped_token = SimpleToken(TEXT_1, tag=TAG_1, tag_scope=TAG_SCOPE_1)
     doc = SimpleStructuredDocument(lines=[SimpleLine([scoped_token])])
     extracted_items = extract_from_annotated_document(
         doc, tag_scope=TAG_SCOPE_1
     )
     tags_and_texts = [(item.tag, item.text) for item in extracted_items]
     assert tags_and_texts == [(TAG_1, TEXT_1)]
 def test_x(self):
     """Expect leading number tokens to be tagged ``'line_no'``.

     Each line starts with a number token at x='1' followed by an
     'other text' token placed 50 units further right at the same y.
     """
     # NOTE(review): the test name is not descriptive, and `line_annotator`
     # is not defined in the visible code - confirm where it comes from.
     number_tokens = [
         SimpleToken(str(line_no), dict(x='1', y=str(line_no * 20)))
         for line_no in range(1, 5)
     ]
     lines = []
     for number_token in number_tokens:
         text_x = str(float(number_token.get_x()) + 50)
         text_token = SimpleToken(
             'other text',
             dict(x=text_x, y=number_token.get_y())
         )
         lines.append(SimpleLine([number_token, text_token]))
     doc = SimpleStructuredDocument(lines=lines)

     line_annotator.annotate(doc)

     actual_tags = [token.get_tag() for token in number_tokens]
     assert actual_tags == ['line_no'] * len(number_tokens)
Esempio n. 23
0
    def test_should_learn_similar_sequence(self):
        """Fit on one tagged sequence and expect a similar one to be predicted.

        The training line repeats the first token; the test line does not.
        """
        def _single_line_document(texts_and_tags):
            # one page with one line holding a tagged token per pair
            tokens = [
                SimpleToken(text, tag=tag) for text, tag in texts_and_tags
            ]
            return SimpleStructuredDocument(
                SimplePage(
                    [SimpleLine(tokens)], bounding_box=PAGE_BOUNDING_BOX
                )
            )

        train_doc = _single_line_document([
            (TEXT_1, TAG_1), (TEXT_1, TAG_1), (TEXT_2, TAG_2), (TEXT_3, TAG_3)
        ])
        test_doc = _single_line_document([
            (TEXT_1, TAG_1), (TEXT_2, TAG_2), (TEXT_3, TAG_3)
        ])

        train_props = list(structured_document_to_token_props(train_doc))
        X_train = [token_props_list_to_features(train_props)]
        y_train = [token_props_list_to_labels(train_props)]

        test_props = list(structured_document_to_token_props(test_doc))
        X_test = [token_props_list_to_features(test_props)]
        y_test = [token_props_list_to_labels(test_props)]

        with create_crf_suite_model() as model:
            model.fit(X_train, y_train)
            assert model.predict(X_test) == y_test
Esempio n. 24
0
 def test_should_return_page_width_and_height(self):
     """Expect ``page`` props to carry the page bounding box dimensions."""
     page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     doc = SimpleStructuredDocument([page])
     token_props_list = list(structured_document_to_token_props(doc))
     page_props = [props.get('page') for props in token_props_list]
     widths = [entry.get('width') for entry in page_props]
     assert widths == [PAGE_BOUNDING_BOX.width]
     heights = [entry.get('height') for entry in page_props]
     assert heights == [PAGE_BOUNDING_BOX.height]
 def test_should_not_tag_single_token_not_within_prediction(self):
     """Expect no CV tag when nothing is painted on the predicted image.

     The image comes straight from ``filled_image(BG_COLOR, ...)`` without
     any rectangle being filled afterwards.
     """
     token = SimpleToken(TOKEN_TEXT_1)
     doc = SimpleStructuredDocument(lines=[SimpleLine([token])])
     page = doc.get_pages()[0]
     doc.set_bounding_box(page, DEFAULT_BOUNDING_BOX)
     doc.set_bounding_box(token, DEFAULT_BOUNDING_BOX)

     background_image = filled_image(BG_COLOR, {TAG_1: COLOR_1})
     annotate_structured_document_using_predicted_images(
         doc, [background_image]
     )
     cv_tag = doc.get_tag(token, scope=CV_TAG_SCOPE)
     assert cv_tag is None
    def test_should_ignore_block_without_bounding_box(self):
        """Expect no blocks for a tagged token created without bounding box."""
        boxless_token = SimpleToken('test')
        doc = SimpleStructuredDocument(lines=[SimpleLine([boxless_token])])
        doc.set_tag(boxless_token, TAG1)

        first_page = doc.get_pages()[0]
        blocks = annotation_document_page_to_annotation_blocks(
            doc, first_page
        )
        assert len(blocks) == 0
Esempio n. 27
0
 def test_should_not_return_line_number_tokens_at_unusual_position2(self):
     """Expect no line numbers when the numbers 1-4 appear at two x positions.

     The same numbers are created once at x=50 and once at x=100, each token
     on a line of its own.
     """
     tokens_by_x = [
         [
             SimpleToken(
                 str(line_no), dict(x=str(x * 50), y=str(line_no * 20))
             )
             for line_no in range(1, 5)
         ]
         for x in range(1, 3)
     ]
     number_tokens = flatten(tokens_by_x)
     single_token_lines = [SimpleLine([token]) for token in number_tokens]
     doc = SimpleStructuredDocument(lines=single_token_lines)

     found_tokens = list(find_line_number_tokens(doc))
     assert found_tokens == []
 def test_should_return_bounding_box(self):
     """Expect each token-props entry to expose the token's bounding box."""
     boxed_token = SimpleToken(TEXT_1, bounding_box=TOKEN_BOUNDING_BOX)
     page = SimplePage(
         [SimpleLine([boxed_token])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     doc = SimpleStructuredDocument([page])
     token_props_list = list(structured_document_to_token_props(doc))
     boxes = [props.get('bounding_box') for props in token_props_list]
     assert boxes == [TOKEN_BOUNDING_BOX]
 def test_should_raise_error_if_token_props_do_not_match(self):
     """Expect ``AssertionError`` if token props text differs from the token."""
     token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
     page = SimplePage(
         lines=[SimpleLine([token])],
         bounding_box=BOUNDING_BOX
     )
     doc = SimpleStructuredDocument(page)
     token_props_list = list(structured_document_to_token_props(doc))
     # tamper with the text so the props no longer match the document token
     first_props = token_props_list[0]
     first_props['text'] = TOKEN_TEXT_2
     with pytest.raises(AssertionError):
         annotate_structured_document_using_predictions(
             doc,
             [TAG_1],
             token_props_list
         )
Esempio n. 30
0
 def test_should_learn_simple_sequence(self):
     """Fit on one tagged line and expect the same labels to be predicted."""
     texts_and_tags = [(TEXT_1, TAG_1), (TEXT_2, TAG_2), (TEXT_3, TAG_3)]
     tokens = [SimpleToken(text, tag=tag) for text, tag in texts_and_tags]
     page = SimplePage([SimpleLine(tokens)], bounding_box=PAGE_BOUNDING_BOX)
     doc = SimpleStructuredDocument(page)

     token_props_list = list(structured_document_to_token_props(doc))
     get_logger().debug('token_props_list:\n%s', token_props_list)
     features = [token_props_list_to_features(token_props_list)]
     labels = [token_props_list_to_labels(token_props_list)]
     get_logger().debug('X:\n%s', features)
     get_logger().debug('y:\n%s', labels)

     with create_crf_suite_model() as model:
         model.fit(features, labels)
         # predicting on the training data itself
         predicted_labels = model.predict(features)
         assert predicted_labels == labels