def test_should_learn_similar_sequence(self):
    """Fit on a four-token tagged sequence and predict a similar three-token one.

    The training line repeats the first token; the test line does not.
    The labels predicted for the test document must equal its own labels.
    """
    train_line = SimpleLine([
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_2, tag=TAG_2),
        SimpleToken(TEXT_3, tag=TAG_3),
    ])
    train_document = SimpleStructuredDocument(
        SimplePage([train_line], bounding_box=PAGE_BOUNDING_BOX))

    test_line = SimpleLine([
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_2, tag=TAG_2),
        SimpleToken(TEXT_3, tag=TAG_3),
    ])
    test_document = SimpleStructuredDocument(
        SimplePage([test_line], bounding_box=PAGE_BOUNDING_BOX))

    train_props = list(structured_document_to_token_props(train_document))
    train_features = [token_props_list_to_features(train_props)]
    train_labels = [token_props_list_to_labels(train_props)]

    test_props = list(structured_document_to_token_props(test_document))
    test_features = [token_props_list_to_features(test_props)]
    expected_labels = [token_props_list_to_labels(test_props)]

    with create_crf_suite_model() as crf_model:
        crf_model.fit(train_features, train_labels)
        predicted_labels = crf_model.predict(test_features)
        assert predicted_labels == expected_labels
def test_should_return_empty_token_list_if_document_has_no_lines(self):
    """A document whose only page contains no lines yields no token props."""
    empty_page = SimplePage([], bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument(empty_page)
    token_props = list(structured_document_to_token_props(document))
    assert token_props == []
def test_should_return_page_width_and_height(self):
    """The 'page' entry of each token props exposes the page's width and height."""
    single_token_line = SimpleLine([SimpleToken(TEXT_1)])
    page = SimplePage([single_token_line], bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([page])

    token_props = list(structured_document_to_token_props(document))
    page_props = [props.get('page') for props in token_props]

    widths = [page_prop.get('width') for page_prop in page_props]
    assert widths == [PAGE_BOUNDING_BOX.width]

    heights = [page_prop.get('height') for page_prop in page_props]
    assert heights == [PAGE_BOUNDING_BOX.height]
def test_should_return_single_token_text(self):
    """A single-token document yields one token props whose 'text' is the token text."""
    line = SimpleLine([SimpleToken(TEXT_1)])
    page = SimplePage([line], bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument(page)

    token_props = list(structured_document_to_token_props(document))
    texts = [props.get('text') for props in token_props]
    assert texts == [TEXT_1]
def test_should_return_bounding_box(self):
    """The token's own bounding box is exposed under 'bounding_box'."""
    token = SimpleToken(TEXT_1, bounding_box=TOKEN_BOUNDING_BOX)
    page = SimplePage([SimpleLine([token])], bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([page])

    token_props = list(structured_document_to_token_props(document))
    bounding_boxes = [props.get('bounding_box') for props in token_props]
    assert bounding_boxes == [TOKEN_BOUNDING_BOX]
def test_should_raise_error_if_token_props_do_not_match(self):
    """Annotating with token props whose text was altered raises AssertionError."""
    token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
    page = SimplePage(lines=[SimpleLine([token])], bounding_box=BOUNDING_BOX)
    document = SimpleStructuredDocument(page)

    mismatching_props = list(structured_document_to_token_props(document))
    # Tamper with the recorded text so it no longer matches the document token.
    mismatching_props[0]['text'] = TOKEN_TEXT_2

    with pytest.raises(AssertionError):
        annotate_structured_document_using_predictions(
            document, [TAG_1], mismatching_props)
def predict_and_annotate_structured_document(structured_document, model,
                                             tag_scope=CRF_TAG_SCOPE):
    """Predict tags for a document's tokens and annotate the document with them.

    Args:
      structured_document: the document to extract token props from and annotate
      model: object with a ``predict`` method; it is called with a
        single-element list of features and its first result is used
      tag_scope: forwarded to annotate_structured_document_using_predictions

    Returns:
      The same ``structured_document`` object that was passed in.
    """
    token_props_list = list(
        structured_document_to_token_props(structured_document))
    features = token_props_list_to_features(token_props_list)
    # The model is given a batch of one document; unwrap its single result.
    batch_predictions = model.predict([features])
    predictions = batch_predictions[0]
    annotate_structured_document_using_predictions(
        structured_document,
        predictions,
        token_props_list,
        tag_scope=tag_scope)
    return structured_document
def test_should_tag_single_token_using_prediction_and_check_token_props(
        self):
    """Annotating with matching token props tags the token in the CRF scope.

    The result of structured_document_to_token_props is handed over as-is
    (it is not wrapped in ``list``).
    """
    token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
    page = SimplePage(lines=[SimpleLine([token])], bounding_box=BOUNDING_BOX)
    document = SimpleStructuredDocument(page)

    matching_props = structured_document_to_token_props(document)
    annotate_structured_document_using_predictions(
        document, [TAG_1], matching_props)

    assigned_tag = document.get_tag(token, scope=CRF_TAG_SCOPE)
    assert assigned_tag == TAG_1
def test_should_return_tag(self):
    """Token props carry each token's tag, or None for an untagged token."""
    first_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
        bounding_box=PAGE_BOUNDING_BOX)
    second_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_3, tag=TAG_3)])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([first_page, second_page])

    token_props = list(structured_document_to_token_props(document))
    tags = [props.get('tag') for props in token_props]
    assert tags == [TAG_1, None, TAG_3]
def test_should_return_line_token_index_and_page_count(self):
    """The 'line_token' entry reports each token's index and count per line.

    Two tokens on the first page's line and one on the second page's line
    are expected to give indices [0, 1, 0] and counts [2, 2, 1].
    """
    two_token_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1), SimpleToken(TEXT_2)])],
        bounding_box=PAGE_BOUNDING_BOX)
    one_token_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_3)])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([two_token_page, one_token_page])

    token_props = list(structured_document_to_token_props(document))
    line_token_props = [props.get('line_token') for props in token_props]

    indices = [line_token.get('index') for line_token in line_token_props]
    assert indices == [0, 1, 0]

    counts = [line_token.get('count') for line_token in line_token_props]
    assert counts == [2, 2, 1]
def test_should_predict_and_annotate_single_token(self):
    """A mocked model's prediction is applied to the token in the CRF scope.

    Also checks that the model's ``predict`` was called with the
    single-document feature batch derived from the document.
    """
    token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
    page = SimplePage(lines=[SimpleLine([token])], bounding_box=BOUNDING_BOX)
    document = SimpleStructuredDocument(page)

    mock_model = MagicMock()
    mock_model.predict.return_value = [[TAG_1]]

    # Compute the expected model input before the document gets annotated.
    props_before_annotation = list(
        structured_document_to_token_props(document))
    expected_features = [
        token_props_list_to_features(props_before_annotation)]

    predict_and_annotate_structured_document(document, mock_model)

    assigned_tag = document.get_tag(token, scope=CRF_TAG_SCOPE)
    assert assigned_tag == TAG_1
    mock_model.predict.assert_called_with(expected_features)
def test_should_return_scoped_tags(self):
    """Only tags set with an explicit tag scope appear under 'scoped_tags'.

    Tokens that are untagged, or tagged without a scope, are expected to
    have an empty 'scoped_tags' mapping.
    """
    unscoped_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
        bounding_box=PAGE_BOUNDING_BOX)
    scoped_page = SimplePage(
        [SimpleLine([SimpleToken(TEXT_3, tag=TAG_3, tag_scope=SCOPE_1)])],
        bounding_box=PAGE_BOUNDING_BOX)
    document = SimpleStructuredDocument([unscoped_page, scoped_page])

    token_props = list(structured_document_to_token_props(document))
    scoped_tags = [props.get('scoped_tags') for props in token_props]
    assert scoped_tags == [{}, {}, {SCOPE_1: TAG_3}]
def load_and_convert_to_token_props(filename, cv_filename, cv_source_tag_scope,
                                    page_range=None):
    """Load a structured document and convert it to a list of token props.

    If ``cv_filename`` is truthy, a second document is loaded from it (using
    the same ``page_range``) and merged into the first via
    merge_with_cv_structured_document before the conversion.

    Args:
      filename: passed to load_structured_document for the main document;
        also included in the error message on failure
      cv_filename: optional; source of the document to merge in
      cv_source_tag_scope: forwarded to merge_with_cv_structured_document
      page_range: optional; forwarded to load_structured_document

    Returns:
      A list built from structured_document_to_token_props.

    Raises:
      RuntimeError: if loading, merging or converting fails; raised through
        raise_from with the original exception passed as the cause.
    """
    try:
        structured_document = load_structured_document(
            filename, page_range=page_range)
        if cv_filename:
            cv_structured_document = load_structured_document(
                cv_filename, page_range=page_range)
            structured_document = merge_with_cv_structured_document(
                structured_document,
                cv_structured_document,
                cv_source_tag_scope=cv_source_tag_scope)
        return list(structured_document_to_token_props(structured_document))
    # `StandardError` only exists in Python 2; on Python 3 referencing it here
    # would itself raise NameError and hide the real failure. `Exception` is
    # available on both and still lets KeyboardInterrupt/SystemExit through.
    except Exception as e:
        raise_from(
            RuntimeError('failed to process %s (due to %s: %s)' %
                         (filename, type(e), e)),
            e)
def test_should_learn_simple_sequence(self):
    """Fit on a three-token tagged sequence and predict the same sequence back."""
    tagged_line = SimpleLine([
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_2, tag=TAG_2),
        SimpleToken(TEXT_3, tag=TAG_3),
    ])
    document = SimpleStructuredDocument(
        SimplePage([tagged_line], bounding_box=PAGE_BOUNDING_BOX))

    props = list(structured_document_to_token_props(document))
    get_logger().debug('token_props_list:\n%s', props)

    features = [token_props_list_to_features(props)]
    labels = [token_props_list_to_labels(props)]
    get_logger().debug('X:\n%s', features)
    get_logger().debug('y:\n%s', labels)

    with create_crf_suite_model() as crf_model:
        crf_model.fit(features, labels)
        predicted_labels = crf_model.predict(features)
        assert predicted_labels == labels
def test_should_pickle_and_unpickle_model(self):
    """A fitted model still predicts the training labels after a pickle round trip."""
    tagged_line = SimpleLine([
        SimpleToken(TEXT_1, tag=TAG_1),
        SimpleToken(TEXT_2, tag=TAG_2),
        SimpleToken(TEXT_3, tag=TAG_3),
    ])
    document = SimpleStructuredDocument(
        SimplePage([tagged_line], bounding_box=PAGE_BOUNDING_BOX))

    props = list(structured_document_to_token_props(document))
    features = [token_props_list_to_features(props)]
    labels = [token_props_list_to_labels(props)]

    # Fit and serialize while the original model's context is still open.
    with create_crf_suite_model() as fitted_model:
        fitted_model.fit(features, labels)
        pickled_bytes = pickle.dumps(fitted_model)

    # The restored copy is used for prediction instead of the fitted instance.
    restored_model = pickle.loads(pickled_bytes)
    predicted_labels = restored_model.predict(features)
    assert predicted_labels == labels
def test_should_return_empty_token_list_if_document_has_no_pages(self):
    """A document constructed with an empty page list yields no token props."""
    document_without_pages = SimpleStructuredDocument([])
    token_props = list(
        structured_document_to_token_props(document_without_pages))
    assert token_props == []