Exemple #1
0
    def test_should_learn_similar_sequence(self):
        """Fit on one tagged line and expect the test line's labels back.

        The training line holds TEXT_1 twice followed by TEXT_2 and TEXT_3;
        the test line holds each of the three texts once, with the same tags.
        """
        def _single_page_document(tokens):
            # one page with one line, using the shared page bounding box
            page = SimplePage(
                [SimpleLine(tokens)], bounding_box=PAGE_BOUNDING_BOX)
            return SimpleStructuredDocument(page)

        def _features_and_labels(document):
            # both results are single-element lists, matching how they are
            # handed to fit/predict below
            token_props = list(structured_document_to_token_props(document))
            features = [token_props_list_to_features(token_props)]
            labels = [token_props_list_to_labels(token_props)]
            return features, labels

        train_document = _single_page_document([
            SimpleToken(TEXT_1, tag=TAG_1),
            SimpleToken(TEXT_1, tag=TAG_1),
            SimpleToken(TEXT_2, tag=TAG_2),
            SimpleToken(TEXT_3, tag=TAG_3)
        ])
        test_document = _single_page_document([
            SimpleToken(TEXT_1, tag=TAG_1),
            SimpleToken(TEXT_2, tag=TAG_2),
            SimpleToken(TEXT_3, tag=TAG_3)
        ])

        X_train, y_train = _features_and_labels(train_document)
        X_test, y_test = _features_and_labels(test_document)

        with create_crf_suite_model() as model:
            model.fit(X_train, y_train)
            y_predicted = model.predict(X_test)
            assert y_predicted == y_test
 def test_should_merge_single_token_and_add_prefix(self):
     """Merge a one-token document and check text, default tag and scoped tag.

     After merging with ``target_scope=SCOPE_1`` the default-scope tag is
     expected to stay TAG_1 while SCOPE_1 carries TAG_2.
     """
     target_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])]
     )
     source_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_2)])]
     )
     merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
     target_doc.merge_with(source_doc, merge_fn)

     tokens = list(target_doc.iter_all_tokens())

     texts = [target_doc.get_text(token) for token in tokens]
     assert texts == [TEXT_1]

     default_tags = [target_doc.get_tag(token) for token in tokens]
     assert default_tags == [TAG_1]

     scoped_tags = [
         target_doc.get_tag(token, scope=SCOPE_1) for token in tokens
     ]
     assert scoped_tags == [TAG_2]
    def test_should_merge_doc_and_scope_cv_tag(
            self, load_structured_document_mock,
            structured_document_to_token_props_mock):
        """Check the document handed to the token-props step has both tags.

        The two mock arguments are presumably injected by patch decorators
        that are not visible here. The loader mock yields the main document
        first and the CV document second.
        """
        main_document = SimpleStructuredDocument(
            lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])]
        )
        cv_token = SimpleToken(TEXT_1, tag=TAG_2, tag_scope=CV_TAG_SCOPE)
        cv_document = SimpleStructuredDocument(
            lines=[SimpleLine([cv_token])]
        )
        load_structured_document_mock.side_effect = [
            main_document, cv_document
        ]

        load_and_convert_to_token_props(
            FILE_1,
            FILE_2,
            cv_source_tag_scope=CV_TAG_SCOPE,
            page_range=PAGE_RANGE
        )

        load_structured_document_mock.assert_any_call(
            FILE_1, page_range=PAGE_RANGE
        )

        # first positional argument of the most recent call
        positional_args = structured_document_to_token_props_mock.call_args[0]
        passed_document = positional_args[0]
        tags_by_scope = [
            passed_document.get_tag_by_scope(token)
            for token in passed_document.iter_all_tokens()
        ]
        assert tags_by_scope == [{None: TAG_1, CV_TAG_SCOPE: TAG_2}]
Exemple #4
0
 def test_should_raise_assertion_error_if_tokens_mismatch(self):
     """Merging a TEXT_1 document with a TEXT_2 document must assert."""
     target_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])]
     )
     source_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_2, tag=TAG_2)])]
     )
     merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
     with pytest.raises(AssertionError):
         target_doc.merge_with(source_doc, merge_fn)
Exemple #5
0
 def test_should_not_override_with_empty_tags(self):
     """An untagged token in the other document must leave TAG_1 in place."""
     target_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])]
     )
     untagged_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1)])]
     )
     # no target scope is bound here, unlike the scoped merge tests
     merge_fn = partial(merge_token_tag)
     target_doc.merge_with(untagged_doc, merge_fn)

     tokens = list(target_doc.iter_all_tokens())
     tags = [target_doc.get_tag(token) for token in tokens]
     assert tags == [TAG_1]
Exemple #6
0
 def test_should_return_line_token_index_and_page_count(self):
     """Check 'line_token' index/count for a two-token and a one-token line."""
     first_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1), SimpleToken(TEXT_2)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     second_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_3)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     document = SimpleStructuredDocument([first_page, second_page])

     token_props_list = list(structured_document_to_token_props(document))
     line_token_props = [props.get('line_token') for props in token_props_list]

     indices = [lt.get('index') for lt in line_token_props]
     assert indices == [0, 1, 0]

     counts = [lt.get('count') for lt in line_token_props]
     assert counts == [2, 2, 1]
 def test_should_merge_from_cv_tag_scope(self):
     """Merge a CV-scoped document and check both default and CV scope tags."""
     main_token = SimpleToken(TEXT_1, tag_scope=None, tag=TAG_1)
     main_document = SimpleStructuredDocument(
         lines=[SimpleLine([main_token])]
     )
     cv_token = SimpleToken(TEXT_1, tag_scope=CV_TAG_SCOPE, tag=TAG_2)
     cv_document = SimpleStructuredDocument(
         lines=[SimpleLine([cv_token])]
     )

     merged_document = merge_with_cv_structured_document(
         main_document,
         cv_document,
         cv_source_tag_scope=CV_TAG_SCOPE
     )

     default_scope_tags = get_all_token_tags(merged_document)
     assert default_scope_tags == [TAG_1]

     cv_scope_tags = get_all_token_tags(merged_document, scope=CV_TAG_SCOPE)
     assert cv_scope_tags == [TAG_2]
Exemple #8
0
 def test_should_not_fail_with_absent_tags(self):
     """Merging an untagged token into SCOPE_1 leaves that scope's tag None."""
     target_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])]
     )
     untagged_doc = SimpleStructuredDocument(
         lines=[SimpleLine([SimpleToken(TEXT_1)])]
     )
     merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
     target_doc.merge_with(untagged_doc, merge_fn)

     tokens = list(target_doc.iter_all_tokens())
     scoped_tags = [
         target_doc.get_tag(token, scope=SCOPE_1) for token in tokens
     ]
     assert scoped_tags == [None]
Exemple #9
0
 def test_should_return_tag(self):
     """Check the 'tag' prop for tagged and untagged tokens across two pages."""
     first_line = SimpleLine([
         SimpleToken(TEXT_1, tag=TAG_1),
         SimpleToken(TEXT_2)
     ])
     second_line = SimpleLine([SimpleToken(TEXT_3, tag=TAG_3)])
     document = SimpleStructuredDocument([
         SimplePage([first_line], bounding_box=PAGE_BOUNDING_BOX),
         SimplePage([second_line], bounding_box=PAGE_BOUNDING_BOX)
     ])

     token_props_list = list(structured_document_to_token_props(document))
     tags = [props.get('tag') for props in token_props_list]
     assert tags == [TAG_1, None, TAG_3]
 def test_should_return_scoped_tags(self):
     """Check 'scoped_tags' is empty unless the token was tagged with a scope."""
     first_line = SimpleLine([
         SimpleToken(TEXT_1, tag=TAG_1),
         SimpleToken(TEXT_2)
     ])
     second_line = SimpleLine([
         SimpleToken(TEXT_3, tag=TAG_3, tag_scope=SCOPE_1)
     ])
     document = SimpleStructuredDocument([
         SimplePage([first_line], bounding_box=PAGE_BOUNDING_BOX),
         SimplePage([second_line], bounding_box=PAGE_BOUNDING_BOX)
     ])

     token_props_list = list(structured_document_to_token_props(document))
     scoped_tags = [props.get('scoped_tags') for props in token_props_list]
     assert scoped_tags == [{}, {}, {SCOPE_1: TAG_3}]
 def test_should_match_case_insensitive(self):
     """A target annotation in different letter case must tag every token."""
     matching_tokens = _tokens_for_text('This Is Matching')
     annotator = MatchingAnnotator([
         TargetAnnotation('tHIS iS mATCHING', TAG1)
     ])
     document = SimpleStructuredDocument(
         lines=[SimpleLine(matching_tokens)]
     )
     annotator.annotate(document)

     expected_tags = [TAG1] * len(matching_tokens)
     assert _get_tags_of_tokens(matching_tokens) == expected_tags
 def test_should_extract_sub_tags_from_single_item(self):
     """Expect one TAG_1 item holding a TAG_2 and a TAG_3 sub item.

     All three tokens share TAG_1; the first two carry sub tag TAG_2 and the
     third starts sub tag TAG_3.
     """
     # (text, tag prefix, sub tag, sub tag prefix) for each token, in order
     token_specs = [
         (VALUE_1, B_TAG_PREFIX, TAG_2, B_TAG_PREFIX),
         (VALUE_2, I_TAG_PREFIX, TAG_2, I_TAG_PREFIX),
         (VALUE_3, I_TAG_PREFIX, TAG_3, B_TAG_PREFIX)
     ]
     tokens = [
         _token_with_sub_tag(
             text,
             tag=TAG_1,
             tag_prefix=tag_prefix,
             sub_tag=sub_tag,
             sub_tag_prefix=sub_tag_prefix
         )
         for text, tag_prefix, sub_tag, sub_tag_prefix in token_specs
     ]
     document = SimpleStructuredDocument(lines=[SimpleLine(tokens)])

     extracted_items = list(extract_from_annotated_document(document))
     result = [
         (item.tag, item.text, [(sub.tag, sub.text) for sub in item.sub_items])
         for item in extracted_items
     ]
     get_logger().debug('result: %s', result)

     expected_text = ' '.join([VALUE_1, VALUE_2, VALUE_3])
     expected_sub_items = [
         (TAG_2, ' '.join([VALUE_1, VALUE_2])),
         (TAG_3, VALUE_3)
     ]
     assert result == [(TAG_1, expected_text, expected_sub_items)]
Exemple #13
0
 def test_should_tag_single_token_within_partial_prediction_at_smaller_scale(self):
     """Tag a token from a predicted image that is smaller than the page.

     The page box is 100x the default size, the token box and the image are
     10x, and only a default-sized rectangle of the image is coloured.
     """
     token_1 = SimpleToken(TOKEN_TEXT_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([token_1])])

     page = document.get_pages()[0]
     page_box = BoundingBox(0, 0, DEFAULT_WIDTH * 100, DEFAULT_HEIGHT * 100)
     document.set_bounding_box(page, page_box)

     token_box = BoundingBox(0, 0, DEFAULT_WIDTH * 10, DEFAULT_HEIGHT * 10)
     document.set_bounding_box(token_1, token_box)

     predicted_image = filled_image(
         BG_COLOR,
         {TAG_1: COLOR_1},
         width=DEFAULT_WIDTH * 10,
         height=DEFAULT_HEIGHT * 10
     )
     coloured_area = BoundingBox(0, 0, DEFAULT_WIDTH, DEFAULT_HEIGHT)
     fill_rect(predicted_image, coloured_area, COLOR_1)

     annotate_structured_document_using_predicted_images(
         document, [predicted_image]
     )
     assert document.get_tag(token_1, scope=CV_TAG_SCOPE) == TAG_1
 def test_should_return_ratio_and_count_of_tagged_tokens(self):
     """Check per-page 'count' and 'percentage' for tagged vs untagged tokens."""
     tagged_tokens = [
         SimpleToken(text) for text in ('this', 'is', 'tagged')
     ]
     not_tagged_tokens = [
         SimpleToken(text) for text in ('this', 'isn\'t')
     ]
     all_tokens = tagged_tokens + not_tagged_tokens
     doc = SimpleStructuredDocument(lines=[SimpleLine(all_tokens)])
     for token in tagged_tokens:
         doc.set_tag(token, TAG1)

     tagged_count = len(tagged_tokens)
     untagged_count = len(not_tagged_tokens)
     num_total = tagged_count + untagged_count

     results = evaluate_document_by_page(doc)

     # untagged tokens are expected under the None key
     expected_page_result = {
         'count': {
             TAG1: tagged_count,
             None: untagged_count
         },
         'percentage': {
             TAG1: tagged_count / num_total,
             None: untagged_count / num_total
         }
     }
     assert results == [expected_page_result]
 def test_should_extract_from_different_tag_scope(self):
     """Extract with ``tag_scope=TAG_SCOPE_1`` from a token tagged in that scope."""
     scoped_token = SimpleToken(TEXT_1, tag=TAG_1, tag_scope=TAG_SCOPE_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([scoped_token])])

     extracted_items = extract_from_annotated_document(
         document, tag_scope=TAG_SCOPE_1
     )
     tag_and_text = [(item.tag, item.text) for item in extracted_items]
     assert tag_and_text == [(TAG_1, TEXT_1)]
def with_tag(x, tag):
    """Apply *tag* to a token, a list, or a line and return the result.

    - ``SimpleToken``: tagged in place via ``set_tag`` and returned.
    - ``list``: a new list with ``with_tag`` applied to each element.
    - ``SimpleLine``: a new ``SimpleLine`` built from its processed tokens.
    - anything else: returned unchanged.
    """
    if isinstance(x, SimpleToken):
        x.set_tag(tag)
        return x
    if isinstance(x, list):
        return [with_tag(item, tag) for item in x]
    if isinstance(x, SimpleLine):
        tagged_tokens = with_tag(x.tokens, tag)
        return SimpleLine(tagged_tokens)
    return x
Exemple #17
0
 def test_should_return_all_tag_by_scope(self):
     """Set a default and a scoped tag, then read them singly and as a dict."""
     token = SimpleToken(TEXT_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([token])])

     document.set_tag(token, TAG_1)
     document.set_tag(token, TAG_2, scope=SCOPE_1)

     default_tag = document.get_tag(token)
     assert default_tag == TAG_1

     scoped_tag = document.get_tag(token, scope=SCOPE_1)
     assert scoped_tag == TAG_2

     # the default scope is expected under the None key
     expected_by_scope = {None: TAG_1, SCOPE_1: TAG_2}
     assert document.get_tag_by_scope(token) == expected_by_scope
 def test_should_not_tag_using_none_tag(self):
     """A NONE_TAG prediction must leave the CRF scope tag unset."""
     token_1 = SimpleToken(TOKEN_TEXT_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([token_1])])

     predictions = [NONE_TAG]
     annotate_structured_document_using_predictions(document, predictions)

     crf_tag = document.get_tag(token_1, scope=CRF_TAG_SCOPE)
     assert crf_tag is None
Exemple #19
0
 def test_should_strip_prefix(self):
     """Prefixed tags must be counted under the plain TAG1 key only."""
     # (text, tag prefix) per token: one B- token followed by two I- tokens
     token_specs = [
         ('this', B_TAG_PREFIX),
         ('is', I_TAG_PREFIX),
         ('tagged', I_TAG_PREFIX)
     ]
     tagged_tokens = [
         SimpleToken(text, tag=TAG1, tag_prefix=prefix)
         for text, prefix in token_specs
     ]
     doc = SimpleStructuredDocument(lines=[SimpleLine(tagged_tokens)])

     results = evaluate_document_by_page(doc)
     first_page_counts = results[0]['count']
     assert set(first_page_counts.keys()) == {TAG1}
Exemple #20
0
 def test_should_return_page_width_and_height(self):
     """Check the 'page' prop reports the page bounding box dimensions."""
     page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     document = SimpleStructuredDocument([page])

     token_props_list = list(structured_document_to_token_props(document))
     page_props = [props.get('page') for props in token_props_list]

     widths = [p.get('width') for p in page_props]
     assert widths == [PAGE_BOUNDING_BOX.width]

     heights = [p.get('height') for p in page_props]
     assert heights == [PAGE_BOUNDING_BOX.height]
    def test_should_ignore_block_without_bounding_box(self):
        """A tagged token without a bounding box must yield no blocks."""
        token = SimpleToken('test')
        document = SimpleStructuredDocument(lines=[SimpleLine([token])])
        document.set_tag(token, TAG1)

        first_page = document.get_pages()[0]
        blocks = annotation_document_page_to_annotation_blocks(
            document, first_page
        )
        assert len(blocks) == 0
 def test_should_not_tag_single_token_not_within_prediction(self):
     """A background-only predicted image must leave the CV scope tag unset."""
     token_1 = SimpleToken(TOKEN_TEXT_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([token_1])])

     first_page = document.get_pages()[0]
     document.set_bounding_box(first_page, DEFAULT_BOUNDING_BOX)
     document.set_bounding_box(token_1, DEFAULT_BOUNDING_BOX)

     # nothing is drawn on the image after it is filled with BG_COLOR
     predicted_image = filled_image(BG_COLOR, {TAG_1: COLOR_1})
     annotate_structured_document_using_predicted_images(
         document, [predicted_image]
     )

     cv_tag = document.get_tag(token_1, scope=CV_TAG_SCOPE)
     assert cv_tag is None
 def test_should_return_single_token_text(self):
     """Check the 'text' prop of a single-token document."""
     page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1)])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     # the page is passed directly rather than inside a list
     document = SimpleStructuredDocument(page)

     token_props_list = list(structured_document_to_token_props(document))
     texts = [props.get('text') for props in token_props_list]
     assert texts == [TEXT_1]
 def test_should_return_bounding_box(self):
     """Check the 'bounding_box' prop equals the token's own bounding box."""
     token = SimpleToken(TEXT_1, bounding_box=TOKEN_BOUNDING_BOX)
     page = SimplePage(
         [SimpleLine([token])],
         bounding_box=PAGE_BOUNDING_BOX
     )
     document = SimpleStructuredDocument([page])

     token_props_list = list(structured_document_to_token_props(document))
     bounding_boxes = [props.get('bounding_box') for props in token_props_list]
     assert bounding_boxes == [TOKEN_BOUNDING_BOX]
Exemple #25
0
 def test_should_not_return_line_number_tokens_at_unusual_position2(self):
     """Numbers repeated at two x positions must not be found as line numbers."""
     # the numbers 1..4 appear once at x=50 and once more at x=100
     tokens_by_x = [
         [
             SimpleToken(
                 str(line_no),
                 {'x': str(x * 50), 'y': str(line_no * 20)}
             )
             for line_no in range(1, 5)
         ]
         for x in range(1, 3)
     ]
     number_tokens = flatten(tokens_by_x)

     # every number token sits on its own line
     single_token_lines = [SimpleLine([token]) for token in number_tokens]
     doc = SimpleStructuredDocument(lines=single_token_lines)

     found_tokens = list(find_line_number_tokens(doc))
     assert found_tokens == []
 def test_should_raise_error_if_token_props_do_not_match(self):
     """Token props whose text was altered must trigger an AssertionError."""
     token_1 = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
     page = SimplePage(
         lines=[SimpleLine([token_1])],
         bounding_box=BOUNDING_BOX
     )
     document = SimpleStructuredDocument(page)

     token_props_list = list(structured_document_to_token_props(document))
     # make the props disagree with the document's token text
     token_props_list[0]['text'] = TOKEN_TEXT_2

     predictions = [TAG_1]
     with pytest.raises(AssertionError):
         annotate_structured_document_using_predictions(
             document, predictions, token_props_list
         )
 def test_should_tag_single_token_using_prediction_and_check_token_props(
         self):
     """Annotate with matching token props and expect the CRF scope tag set."""
     token_1 = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
     page = SimplePage(
         lines=[SimpleLine([token_1])],
         bounding_box=BOUNDING_BOX
     )
     document = SimpleStructuredDocument(page)

     # passed on as returned, without converting to a list first
     token_props = structured_document_to_token_props(document)
     annotate_structured_document_using_predictions(
         document, [TAG_1], token_props
     )

     crf_tag = document.get_tag(token_1, scope=CRF_TAG_SCOPE)
     assert crf_tag == TAG_1
    def test_should_strip_tag_prefix(self):
        """Blocks must carry the plain tag even if the token tag is prefixed."""
        token = SimpleToken(
            'test',
            tag=TAG1,
            tag_prefix=B_TAG_PREFIX,
            bounding_box=DEFAULT_BOUNDING_BOX
        )
        # sanity check: the token itself reports the prefixed tag
        prefixed_tag = B_TAG_PREFIX + TAG1
        assert token.get_tag() == prefixed_tag

        document = SimpleStructuredDocument(lines=[SimpleLine([token])])
        first_page = document.get_pages()[0]
        blocks = annotation_document_page_to_annotation_blocks(
            document, first_page
        )
        block_tags = [block.tag for block in blocks]
        assert block_tags == [TAG1]
 def test_should_predict_and_annotate_single_token(self):
     """Predict with a mocked model and expect the token tagged in CRF scope.

     Also checks the model was called with the features derived from the
     document's token props.
     """
     token_1 = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
     page = SimplePage(
         lines=[SimpleLine([token_1])],
         bounding_box=BOUNDING_BOX
     )
     document = SimpleStructuredDocument(page)

     model = MagicMock()
     model.predict.return_value = [[TAG_1]]

     # expected features are computed before the document is annotated
     token_props = list(structured_document_to_token_props(document))
     expected_features = [token_props_list_to_features(token_props)]

     predict_and_annotate_structured_document(document, model)

     crf_tag = document.get_tag(token_1, scope=CRF_TAG_SCOPE)
     assert crf_tag == TAG_1
     model.predict.assert_called_with(expected_features)
Exemple #30
0
 def test_should_not_return_line_number_tokens_if_not_line(self):
     """Numbers sharing a line with text at a smaller x must not be found."""
     number_tokens = [
         SimpleToken(str(line_no), dict(x=str(30), y=str(line_no * 20)))
         for line_no in range(1, 5)
     ]

     def _line_with_other_text(number_token):
         # 'other text' is placed at x=20 with the same y as the number token
         other_token = SimpleToken(
             'other text',
             dict(x=str(20), y=number_token.get_y())
         )
         return SimpleLine([number_token, other_token])

     doc = SimpleStructuredDocument(lines=[
         _line_with_other_text(number_token)
         for number_token in number_tokens
     ])

     found_tokens = list(find_line_number_tokens(doc))
     nothing_expected = []
     assert found_tokens == nothing_expected