def test_should_learn_similar_sequence(self):
    """Fit the model on one tagged line and predict the labels of a similar line.

    The training line repeats the TEXT_1/TAG_1 token before TEXT_2 and TEXT_3,
    while the test line holds each of the three tokens once. The predicted
    labels for the test line must equal the labels derived from its own tags.
    """
    def _one_line_document(tokens):
        # A single page with a single line, using the shared page bounding box.
        return SimpleStructuredDocument(
            SimplePage([SimpleLine(tokens)], bounding_box=PAGE_BOUNDING_BOX))

    def _features_and_labels(document):
        # Each document contributes exactly one sequence, hence the outer lists.
        props = list(structured_document_to_token_props(document))
        return (
            [token_props_list_to_features(props)],
            [token_props_list_to_labels(props)]
        )

    train_document = _one_line_document([
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_2, tag=TAG_2),
        SimpleToken(TEXT_3, tag=TAG_3)
    ])
    test_document = _one_line_document([
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_2, tag=TAG_2),
        SimpleToken(TEXT_3, tag=TAG_3)
    ])

    X_train, y_train = _features_and_labels(train_document)
    X_test, y_test = _features_and_labels(test_document)

    with create_crf_suite_model() as model:
        model.fit(X_train, y_train)
        y_predicted = model.predict(X_test)
        assert y_predicted == y_test
def test_should_merge_single_token_and_add_prefix(self):
    """Merge a single-token document into another using a target scope.

    After merging, the token keeps its text and its default-scope tag
    (TAG_1), and the other document's tag (TAG_2) is available under SCOPE_1.
    """
    target_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])])
    source_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_2)])])

    merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
    target_document.merge_with(source_document, merge_fn)

    tokens = list(target_document.iter_all_tokens())

    texts = [target_document.get_text(token) for token in tokens]
    assert texts == [TEXT_1]

    default_tags = [target_document.get_tag(token) for token in tokens]
    assert default_tags == [TAG_1]

    scoped_tags = [
        target_document.get_tag(token, scope=SCOPE_1) for token in tokens
    ]
    assert scoped_tags == [TAG_2]
def test_should_merge_doc_and_scope_cv_tag(
        self,
        load_structured_document_mock,
        structured_document_to_token_props_mock):
    """Check the document handed to the token-props conversion has both tags.

    The load mock returns the main document first and the CV document second.
    The document passed on to ``structured_document_to_token_props`` must
    carry TAG_1 in the default scope and TAG_2 under CV_TAG_SCOPE.
    """
    main_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])])
    cv_document = SimpleStructuredDocument(lines=[
        SimpleLine([SimpleToken(TEXT_1, tag=TAG_2, tag_scope=CV_TAG_SCOPE)])
    ])
    # Consecutive calls to the load mock yield these documents in order.
    load_structured_document_mock.side_effect = [main_document, cv_document]

    load_and_convert_to_token_props(
        FILE_1,
        FILE_2,
        cv_source_tag_scope=CV_TAG_SCOPE,
        page_range=PAGE_RANGE)

    load_structured_document_mock.assert_any_call(
        FILE_1, page_range=PAGE_RANGE)

    positional_args = structured_document_to_token_props_mock.call_args[0]
    passed_document = positional_args[0]
    tags_by_scope = [
        passed_document.get_tag_by_scope(token)
        for token in passed_document.iter_all_tokens()
    ]
    assert tags_by_scope == [{None: TAG_1, CV_TAG_SCOPE: TAG_2}]
def test_should_raise_assertion_error_if_tokens_mismatch(self):
    """Expect an AssertionError when merging documents whose token texts differ."""
    target_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])])
    # Same structure, but the token text is TEXT_2 rather than TEXT_1.
    mismatching_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_2, tag=TAG_2)])])
    merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)

    with pytest.raises(AssertionError):
        target_document.merge_with(mismatching_document, merge_fn)
def test_should_not_override_with_empty_tags(self):
    """Merging an untagged token must leave the existing tag (TAG_1) in place."""
    target_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])])
    untagged_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1)])])

    # No target scope is bound here; merge_token_tag is used with its defaults.
    merge_fn = partial(merge_token_tag)
    target_document.merge_with(untagged_document, merge_fn)

    tokens = list(target_document.iter_all_tokens())
    tags = [target_document.get_tag(token) for token in tokens]
    assert tags == [TAG_1]
def test_should_return_line_token_index_and_page_count(self):
    """Check the ``line_token`` index and count reported for each token.

    The first page has one line with two tokens and the second page has one
    line with a single token, so the indices are 0, 1, 0 and the counts
    are 2, 2, 1.
    """
    two_token_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1), SimpleToken(TEXT_2)])],
        bounding_box=PAGE_BOUNDING_BOX)
    one_token_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_3)])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([two_token_page, one_token_page])

    token_props = list(structured_document_to_token_props(document))
    line_token_props = [props.get('line_token') for props in token_props]

    indices = [entry.get('index') for entry in line_token_props]
    assert indices == [0, 1, 0]

    counts = [entry.get('count') for entry in line_token_props]
    assert counts == [2, 2, 1]
def test_should_merge_from_cv_tag_scope(self):
    """Merge a CV document whose tag is stored under CV_TAG_SCOPE.

    The merged result must keep TAG_1 in the default scope and expose TAG_2
    under CV_TAG_SCOPE.
    """
    base_document = SimpleStructuredDocument(lines=[
        SimpleLine([SimpleToken(TEXT_1, tag_scope=None, tag=TAG_1)])
    ])
    cv_document = SimpleStructuredDocument(lines=[
        SimpleLine([SimpleToken(TEXT_1, tag_scope=CV_TAG_SCOPE, tag=TAG_2)])
    ])

    merged_document = merge_with_cv_structured_document(
        base_document,
        cv_document,
        cv_source_tag_scope=CV_TAG_SCOPE)

    assert get_all_token_tags(merged_document) == [TAG_1]
    cv_tags = get_all_token_tags(merged_document, scope=CV_TAG_SCOPE)
    assert cv_tags == [TAG_2]
def test_should_not_fail_with_absent_tags(self):
    """Merging an untagged token into SCOPE_1 must leave that scope's tag as None."""
    target_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1, tag=TAG_1)])])
    untagged_document = SimpleStructuredDocument(
        lines=[SimpleLine([SimpleToken(TEXT_1)])])

    merge_fn = partial(merge_token_tag, target_scope=SCOPE_1)
    target_document.merge_with(untagged_document, merge_fn)

    tokens = list(target_document.iter_all_tokens())
    scoped_tags = [
        target_document.get_tag(token, scope=SCOPE_1) for token in tokens
    ]
    assert scoped_tags == [None]
def test_should_return_tag(self):
    """Check the ``tag`` entry of the token props, including an untagged token.

    Tokens are spread across two pages; the second token has no tag, so its
    ``tag`` value is expected to be None.
    """
    first_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
        bounding_box=PAGE_BOUNDING_BOX)
    second_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_3, tag=TAG_3)])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([first_page, second_page])

    token_props = list(structured_document_to_token_props(document))
    tags = [props.get('tag') for props in token_props]
    assert tags == [TAG_1, None, TAG_3]
def test_should_return_scoped_tags(self):
    """Check the ``scoped_tags`` entry of the token props.

    Only the third token is tagged under SCOPE_1; the first two tokens
    (one tagged in the default scope, one untagged) are expected to report
    an empty mapping.
    """
    first_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
        bounding_box=PAGE_BOUNDING_BOX)
    second_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_3, tag=TAG_3, tag_scope=SCOPE_1)])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([first_page, second_page])

    token_props = list(structured_document_to_token_props(document))
    scoped_tags = [props.get('scoped_tags') for props in token_props]
    assert scoped_tags == [{}, {}, {SCOPE_1: TAG_3}]
def test_should_match_case_insensitive(self):
    """Expect every token to be tagged when only the letter case differs."""
    tokens = _tokens_for_text('This Is Matching')
    # Same words as the tokens, but with the case of each letter inverted.
    annotations = [TargetAnnotation('tHIS iS mATCHING', TAG1)]
    document = SimpleStructuredDocument(lines=[SimpleLine(tokens)])

    annotator = MatchingAnnotator(annotations)
    annotator.annotate(document)

    expected_tags = [TAG1] * len(tokens)
    assert _get_tags_of_tokens(tokens) == expected_tags
def test_should_extract_sub_tags_from_single_item(self):
    """Extract one TAG_1 item whose tokens carry two different sub tags.

    All three tokens belong to TAG_1 (begin, inside, inside). The first two
    tokens share sub tag TAG_2 and the third starts sub tag TAG_3, so the
    single extracted item is expected to have two sub items.
    """
    first_token = _token_with_sub_tag(
        VALUE_1,
        tag=TAG_1, tag_prefix=B_TAG_PREFIX,
        sub_tag=TAG_2, sub_tag_prefix=B_TAG_PREFIX)
    second_token = _token_with_sub_tag(
        VALUE_2,
        tag=TAG_1, tag_prefix=I_TAG_PREFIX,
        sub_tag=TAG_2, sub_tag_prefix=I_TAG_PREFIX)
    third_token = _token_with_sub_tag(
        VALUE_3,
        tag=TAG_1, tag_prefix=I_TAG_PREFIX,
        sub_tag=TAG_3, sub_tag_prefix=B_TAG_PREFIX)
    document = SimpleStructuredDocument(
        lines=[SimpleLine([first_token, second_token, third_token])])

    items = list(extract_from_annotated_document(document))
    result = [
        (
            item.tag,
            item.text,
            [(sub_item.tag, sub_item.text) for sub_item in item.sub_items]
        )
        for item in items
    ]
    get_logger().debug('result: %s', result)

    expected_sub_items = [
        (TAG_2, ' '.join([VALUE_1, VALUE_2])),
        (TAG_3, VALUE_3)
    ]
    expected_text = ' '.join([VALUE_1, VALUE_2, VALUE_3])
    assert result == [(TAG_1, expected_text, expected_sub_items)]
def test_should_tag_single_token_within_partial_prediction_at_smaller_scale(self):
    """Tag a token from a predicted image that is smaller than the page.

    The page box is 100x the default size and the token box 10x, while the
    predicted image is 10x the default size with only a default-sized
    rectangle at the origin filled with COLOR_1. The token is still expected
    to receive TAG_1 under CV_TAG_SCOPE.
    """
    token = SimpleToken(TOKEN_TEXT_1)
    document = SimpleStructuredDocument(lines=[SimpleLine([token])])

    page = document.get_pages()[0]
    page_box = BoundingBox(0, 0, DEFAULT_WIDTH * 100, DEFAULT_HEIGHT * 100)
    document.set_bounding_box(page, page_box)

    token_box = BoundingBox(0, 0, DEFAULT_WIDTH * 10, DEFAULT_HEIGHT * 10)
    document.set_bounding_box(token, token_box)

    predicted_image = filled_image(
        BG_COLOR,
        {TAG_1: COLOR_1},
        width=DEFAULT_WIDTH * 10,
        height=DEFAULT_HEIGHT * 10)
    tagged_area = BoundingBox(0, 0, DEFAULT_WIDTH, DEFAULT_HEIGHT)
    fill_rect(predicted_image, tagged_area, COLOR_1)

    annotate_structured_document_using_predicted_images(
        document, [predicted_image])

    assert document.get_tag(token, scope=CV_TAG_SCOPE) == TAG_1
def test_should_return_ratio_and_count_of_tagged_tokens(self):
    """Check the per-page count and percentage for tagged and untagged tokens.

    Three tokens are tagged with TAG1 and two are left untagged (reported
    under the None key). All five tokens sit on one line, and a single
    result entry is expected.
    """
    with_tag_tokens = [
        SimpleToken('this'),
        SimpleToken('is'),
        SimpleToken('tagged')
    ]
    without_tag_tokens = [
        SimpleToken('this'),
        SimpleToken('isn\'t')
    ]
    document = SimpleStructuredDocument(
        lines=[SimpleLine(with_tag_tokens + without_tag_tokens)])
    for tagged_token in with_tag_tokens:
        document.set_tag(tagged_token, TAG1)

    tagged_count = len(with_tag_tokens)
    untagged_count = len(without_tag_tokens)
    total_count = tagged_count + untagged_count

    actual = evaluate_document_by_page(document)

    expected_page_result = {
        'count': {
            TAG1: tagged_count,
            None: untagged_count
        },
        'percentage': {
            TAG1: tagged_count / total_count,
            None: untagged_count / total_count
        }
    }
    assert actual == [expected_page_result]
def test_should_extract_from_different_tag_scope(self):
    """Extract an item whose tag is stored under TAG_SCOPE_1."""
    scoped_token = SimpleToken(TEXT_1, tag=TAG_1, tag_scope=TAG_SCOPE_1)
    document = SimpleStructuredDocument(lines=[SimpleLine([scoped_token])])

    extracted_items = extract_from_annotated_document(
        document, tag_scope=TAG_SCOPE_1)
    tag_and_text = [(item.tag, item.text) for item in extracted_items]

    assert tag_and_text == [(TAG_1, TEXT_1)]
def with_tag(x, tag):
    """Apply ``tag`` to ``x`` and return the tagged value.

    - ``SimpleToken``: the tag is set on the token itself, which is returned.
    - ``list``: a new list with ``with_tag`` applied to each element.
    - ``SimpleLine``: a new ``SimpleLine`` built from the line's tagged tokens.
    - anything else: returned unchanged.
    """
    if isinstance(x, SimpleToken):
        x.set_tag(tag)
        return x
    if isinstance(x, list):
        return [with_tag(item, tag) for item in x]
    if isinstance(x, SimpleLine):
        tagged_tokens = with_tag(x.tokens, tag)
        return SimpleLine(tagged_tokens)
    return x
def test_should_return_all_tag_by_scope(self):
    """Set a default-scope tag and a SCOPE_1 tag, then read both back.

    Each tag must be readable individually via ``get_tag`` and together via
    ``get_tag_by_scope``, where the default scope appears under the None key.
    """
    token = SimpleToken(TEXT_1)
    document = SimpleStructuredDocument(lines=[SimpleLine([token])])

    document.set_tag(token, TAG_1)
    document.set_tag(token, TAG_2, scope=SCOPE_1)

    assert document.get_tag(token) == TAG_1
    assert document.get_tag(token, scope=SCOPE_1) == TAG_2

    expected_by_scope = {None: TAG_1, SCOPE_1: TAG_2}
    assert document.get_tag_by_scope(token) == expected_by_scope
def test_should_not_tag_using_none_tag(self):
    """A NONE_TAG prediction must leave the token's CRF_TAG_SCOPE tag as None."""
    token = SimpleToken(TOKEN_TEXT_1)
    document = SimpleStructuredDocument(lines=[SimpleLine([token])])
    predictions = [NONE_TAG]

    annotate_structured_document_using_predictions(document, predictions)

    assert document.get_tag(token, scope=CRF_TAG_SCOPE) is None
def test_should_strip_prefix(self):
    """The count keys must be the plain tag, without the begin/inside prefixes."""
    tokens = [
        SimpleToken('this', tag=TAG1, tag_prefix=B_TAG_PREFIX),
        SimpleToken('is', tag=TAG1, tag_prefix=I_TAG_PREFIX),
        SimpleToken('tagged', tag=TAG1, tag_prefix=I_TAG_PREFIX)
    ]
    document = SimpleStructuredDocument(lines=[SimpleLine(tokens)])

    page_results = evaluate_document_by_page(document)

    first_page_counts = page_results[0]['count']
    assert set(first_page_counts.keys()) == {TAG1}
def test_should_return_page_width_and_height(self):
    """The ``page`` entry must report the page bounding box width and height."""
    page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1)])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([page])

    token_props = list(structured_document_to_token_props(document))
    page_props = [props.get('page') for props in token_props]

    widths = [entry.get('width') for entry in page_props]
    assert widths == [PAGE_BOUNDING_BOX.width]

    heights = [entry.get('height') for entry in page_props]
    assert heights == [PAGE_BOUNDING_BOX.height]
def test_should_ignore_block_without_bounding_box(self):
    """A tagged token that has no bounding box must not produce any block."""
    token = SimpleToken('test')
    document = SimpleStructuredDocument(lines=[SimpleLine([token])])
    document.set_tag(token, TAG1)

    first_page = document.get_pages()[0]
    blocks = annotation_document_page_to_annotation_blocks(
        document, first_page)

    assert len(blocks) == 0
def test_should_not_tag_single_token_not_within_prediction(self):
    """Leave the token untagged when the image holds only the background colour.

    The image is created with BG_COLOR and the TAG_1/COLOR_1 mapping, but no
    rectangle is filled with COLOR_1, so no CV tag is expected on the token.
    """
    token = SimpleToken(TOKEN_TEXT_1)
    document = SimpleStructuredDocument(lines=[SimpleLine([token])])

    page = document.get_pages()[0]
    document.set_bounding_box(page, DEFAULT_BOUNDING_BOX)
    document.set_bounding_box(token, DEFAULT_BOUNDING_BOX)

    background_only_image = filled_image(BG_COLOR, {TAG_1: COLOR_1})
    annotate_structured_document_using_predicted_images(
        document, [background_only_image])

    assert document.get_tag(token, scope=CV_TAG_SCOPE) is None
def test_should_return_single_token_text(self):
    """The ``text`` entry must match the text of the document's only token."""
    page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1)])],
        bounding_box=PAGE_BOUNDING_BOX)
    # The page is passed directly here, not wrapped in a list.
    document = SimpleStructuredDocument(page)

    token_props = list(structured_document_to_token_props(document))
    texts = [props.get('text') for props in token_props]

    assert texts == [TEXT_1]
def test_should_return_bounding_box(self):
    """The ``bounding_box`` entry must be the token's own bounding box."""
    token = SimpleToken(TEXT_1, bounding_box=TOKEN_BOUNDING_BOX)
    page = SimplePage(
        [SimpleLine([token])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([page])

    token_props = list(structured_document_to_token_props(document))
    bounding_boxes = [props.get('bounding_box') for props in token_props]

    assert bounding_boxes == [TOKEN_BOUNDING_BOX]
def test_should_not_return_line_number_tokens_at_unusual_position2(self):
    """Expect no line number tokens when the numbers 1-4 appear at two x positions.

    The same run of numbers is placed at x=50 and again at x=100, each
    number token on a line of its own.
    """
    def _number_token(line_no, column):
        # Coordinates are passed as strings in the token's attribute dict.
        return SimpleToken(
            str(line_no),
            dict(x=str(column * 50), y=str(line_no * 20)))

    tokens_by_column = [
        [_number_token(line_no, x) for line_no in range(1, 5)]
        for x in range(1, 3)
    ]
    number_tokens = flatten(tokens_by_column)
    document = SimpleStructuredDocument(
        lines=[SimpleLine([token]) for token in number_tokens])

    found_tokens = list(find_line_number_tokens(document))

    assert found_tokens == []
def test_should_raise_error_if_token_props_do_not_match(self):
    """Expect an AssertionError when the token props text differs from the token."""
    token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
    page = SimplePage(lines=[SimpleLine([token])], bounding_box=BOUNDING_BOX)
    document = SimpleStructuredDocument(page)

    token_props = list(structured_document_to_token_props(document))
    # Tamper with the props so they no longer describe the document's token.
    token_props[0]['text'] = TOKEN_TEXT_2

    with pytest.raises(AssertionError):
        annotate_structured_document_using_predictions(
            document, [TAG_1], token_props)
def test_should_tag_single_token_using_prediction_and_check_token_props(self):
    """Tag a token from a prediction while also supplying matching token props."""
    token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
    page = SimplePage(lines=[SimpleLine([token])], bounding_box=BOUNDING_BOX)
    document = SimpleStructuredDocument(page)

    # Passed on exactly as returned, i.e. without converting it to a list first.
    token_props = structured_document_to_token_props(document)
    predictions = [TAG_1]

    annotate_structured_document_using_predictions(
        document, predictions, token_props)

    assert document.get_tag(token, scope=CRF_TAG_SCOPE) == TAG_1
def test_should_strip_tag_prefix(self):
    """The annotation block's tag must not include the token's tag prefix."""
    token = SimpleToken(
        'test',
        tag=TAG1,
        tag_prefix=B_TAG_PREFIX,
        bounding_box=DEFAULT_BOUNDING_BOX)
    # Sanity check: the token itself reports the tag including the prefix.
    assert token.get_tag() == B_TAG_PREFIX + TAG1

    document = SimpleStructuredDocument(lines=[SimpleLine([token])])
    first_page = document.get_pages()[0]
    blocks = annotation_document_page_to_annotation_blocks(
        document, first_page)

    block_tags = [block.tag for block in blocks]
    assert block_tags == [TAG1]
def test_should_predict_and_annotate_single_token(self):
    """Predict with a mocked model and expect the token to be tagged accordingly.

    The mock's ``predict`` returns TAG_1 for the single token. Besides the
    resulting tag, the test verifies that ``predict`` was called with the
    features computed from the document's token props.
    """
    token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
    page = SimplePage(lines=[SimpleLine([token])], bounding_box=BOUNDING_BOX)
    document = SimpleStructuredDocument(page)

    model = MagicMock()
    model.predict.return_value = [[TAG_1]]

    # Expected features are computed before the document gets annotated.
    token_props = list(structured_document_to_token_props(document))
    expected_features = [token_props_list_to_features(token_props)]

    predict_and_annotate_structured_document(document, model)

    assert document.get_tag(token, scope=CRF_TAG_SCOPE) == TAG_1
    model.predict.assert_called_with(expected_features)
def test_should_not_return_line_number_tokens_if_not_line(self):
    """Expect no line number tokens when each number shares its line with text.

    Every number token (x=30) is followed on the same line by an
    'other text' token at x=20 with the same y value.
    """
    def _line_with_other_text(number_token):
        other_token = SimpleToken(
            'other text',
            dict(x=str(20), y=number_token.get_y()))
        return SimpleLine([number_token, other_token])

    number_tokens = [
        SimpleToken(str(line_no), dict(x=str(30), y=str(line_no * 20)))
        for line_no in range(1, 5)
    ]
    document = SimpleStructuredDocument(
        lines=[_line_with_other_text(token) for token in number_tokens])

    expected_tokens = []
    found_tokens = list(find_line_number_tokens(document))

    assert found_tokens == expected_tokens