def test_extract_ocr_data_no_table_match(
    mocked_extract_fields, mocked_extract_table_data, mocked_get_table_rows
):
    """
    Unit: tests the overall ocr data extraction function where there is no
    table_data match.

    The table extraction mock returns an empty dict, so the result is
    expected to carry the fields and an empty table, and the row extraction
    helper must not be called at all.
    """
    # mock results
    mocked_fields = {"field1": "result1", "field2": "result2"}
    mocked_table_data = {}  # empty dict stands for "no table matched"
    mocked_extract_fields.return_value = mocked_fields
    mocked_extract_table_data.return_value = mocked_table_data

    expected_extracted_data = {
        "fields": mocked_fields,
        "table": mocked_table_data,
    }

    # method invocation
    assert (
        extract_ocr_data("ocr result", {"test": "test"})
        == expected_extracted_data
    )

    # mock assertions
    mocked_extract_fields.assert_called_once_with(
        "ocr result", {"test": "test"}
    )
    # NOTE: this must be the real ``assert_called_once_with``; any other
    # attribute name on a Mock is auto-created and calling it checks nothing.
    mocked_extract_table_data.assert_called_once_with(
        "ocr result", {"test": "test"}
    )
    # no row extraction if there is no table
    mocked_get_table_rows.assert_not_called()
def test_extract_ocr_data_uniqueness_not_found(
    mocked_extract_fields,
    mocked_extract_table_data,
    mocked_get_table_rows,
    mocked_extract_row_named_groups,
):
    """
    Unit: tests the overall ocr data extraction function where there is a
    table_data match and row matches, but the uniqueness field declared in
    the DRM is not among the extracted fields.

    In that case the extraction is expected to return an empty dict.
    """
    drm = {"test": "test", "uniqueness_fields": ["field99"]}

    # mock results ("field99", the uniqueness field, is NOT among the fields)
    mocked_fields = {"field1": "result1", "field2": "result2"}
    mocked_table_data = {
        "header": "test",
        "all_rows": "row1 row2 row3",
        "footer": "footer",
    }
    mocked_table_rows = ["row1", "row2", "row3"]
    mocked_row_named_groups = [
        {"row": "row1", "data": {"group_1": "value_1"}},
        {"row": "row2", "data": {"group_1": "value_1"}},
        {"row": "row3", "data": {"group_1": "value_1"}},
    ]
    mocked_extract_fields.return_value = mocked_fields
    mocked_extract_table_data.return_value = mocked_table_data
    mocked_get_table_rows.return_value = mocked_table_rows
    # one result per row, handed out in call order
    mocked_extract_row_named_groups.side_effect = mocked_row_named_groups

    # method invocation
    assert extract_ocr_data("ocr result", drm) == {}  # uniqueness fields fail

    # mock assertions
    mocked_extract_fields.assert_called_once_with("ocr result", drm)
    mocked_extract_table_data.assert_called_once_with("ocr result", drm)
    mocked_get_table_rows.assert_called_once_with(
        mocked_table_data["all_rows"], drm
    )
    # NOTE: ``assert_has_calls`` has to be *called* on the mock itself;
    # assigning a list to that attribute would verify nothing.
    mocked_extract_row_named_groups.assert_has_calls(
        [
            mock.call("row1", drm),
            mock.call("row2", drm),
            mock.call("row3", drm),
        ]
    )
def parse_ocr_result(ocr_result, drms):
    """
    Parses and extracts data from the OCR document result string by using a
    DRM (Document Regexp Model) that matches this OCR string.

    When several DRMs match, only the first one returned by
    ``get_all_drms_match`` is used.

    Args:
        ocr_result (str): OCR result string;
        drms (list): list of all DRMs dicts found in the DRM directory folder.

    Returns:
        (dict): the extracted data from the OCR results, or an empty dict
        when no DRM matches the OCR result.

    Example of the extracted data:
        {
            "fields": {
                "field1": "result1",
                "field2": "result2"
            },
            "table": {
                "header": "table header",
                "all_rows": "all rows together here...",
                "rows": [
                    "row 1 result",
                    "row 2 result",
                    ...
                ],
                "footer": "table footer"
            }
        }
    """
    logger.info("Verifying DRMs that match with this OCR document string...")
    matching_drms = get_all_drms_match(ocr_result, drms)

    if not matching_drms:
        # The message names the value that is actually returned below.
        logger.warning(
            "No DRM matches this OCR result. Returning an empty dict..."
        )
        return {}

    # Only the first matching DRM is used for the extraction.
    drm = matching_drms[0]
    logger.info("Using the following DRM: %s", drm)

    logger.info("Pre processing the OCR result according to DRM...")
    pre_processed_result = pre_process_result(ocr_result, drm)
    logger.debug(
        "Showing pre processed OCR result...\n%s", pre_processed_result
    )

    logger.info("Extracting json data from the OCR pre processed result...")
    return extract_ocr_data(pre_processed_result, drm)
def test_extract_ocr_data_table_match_rows_no_match(
    mocked_extract_fields, mocked_extract_table_data, mocked_get_table_rows
):
    """
    Unit: tests the overall ocr data extraction function where there is a
    table_data match but no row regexp matches.

    The row lookup mock returns an empty list, so the resulting table is
    expected to carry an empty "rows" entry.
    """
    # mock results
    mocked_fields = {"field1": "result1", "field2": "result2"}
    mocked_table_data = {
        "header": "test",
        "all_rows": "row1 row2 row3",
        "footer": "footer",
    }
    mocked_table_rows = []  # no row matches
    mocked_extract_fields.return_value = mocked_fields
    mocked_extract_table_data.return_value = mocked_table_data
    mocked_get_table_rows.return_value = mocked_table_rows

    expected_extracted_data = {
        "fields": mocked_fields,
        "table": mocked_table_data,
    }
    # "table" is the same dict object as mocked_table_data, so this also
    # adds the "rows" key to the mocked return value.
    expected_extracted_data["table"]["rows"] = mocked_table_rows  # [] now

    # method invocation
    assert (
        extract_ocr_data("ocr result", {"test": "test"})
        == expected_extracted_data
    )

    # mock assertions
    mocked_extract_fields.assert_called_once_with(
        "ocr result", {"test": "test"}
    )
    # NOTE: this must be the real ``assert_called_once_with``; any other
    # attribute name on a Mock is auto-created and calling it checks nothing.
    mocked_extract_table_data.assert_called_once_with(
        "ocr result", {"test": "test"}
    )
    mocked_get_table_rows.assert_called_once_with(
        mocked_table_data["all_rows"], {"test": "test"}
    )