def test_extract_ocr_data_no_table_match(mocked_extract_fields,
                                         mocked_extract_table_data,
                                         mocked_get_table_rows):
    """
    Unit: tests the overall ocr data extraction function where there is
          no table_data match.

    The three parameters are mock objects supplied from outside this
    function (presumably by ``mock.patch`` decorators -- TODO confirm).
    """
    # mock results
    mocked_fields = {"field1": "result1", "field2": "result2"}

    mocked_table_data = {}  # empty dict stands for "no table match"

    mocked_extract_fields.return_value = mocked_fields
    mocked_extract_table_data.return_value = mocked_table_data

    expected_extracted_data = {
        "fields": mocked_fields,
        "table": mocked_table_data,
    }

    # method invocation
    assert (extract_ocr_data("ocr result",
                             {"test": "test"}) == expected_extracted_data)

    # mock assertions
    mocked_extract_fields.assert_called_once_with("ocr result",
                                                  {"test": "test"})

    # The assertion name must be spelled exactly: a misspelled name such as
    # ``return_assert_called_once_with`` is silently auto-created by Mock as
    # a child attribute and verifies nothing.
    mocked_extract_table_data.assert_called_once_with(
        "ocr result", {"test": "test"})

    # no row extraction if there is no table
    mocked_get_table_rows.assert_not_called()
def test_extract_ocr_data_uniqueness_not_found(
    mocked_extract_fields,
    mocked_extract_table_data,
    mocked_get_table_rows,
    mocked_extract_row_named_groups,
):
    """
    Unit: tests the overall ocr data extraction function where there is
          a table_data match and rows matches, but the uniqueness field
          declared by the DRM is not among the extracted fields, so an
          empty dict is expected as the result.

    The parameters are mock objects supplied from outside this function
    (presumably by ``mock.patch`` decorators -- TODO confirm).
    """
    drm = {"test": "test", "uniqueness_fields": ["field99"]}

    # mock results ("field99", the uniqueness field, is deliberately absent)
    mocked_fields = {"field1": "result1", "field2": "result2"}

    mocked_table_data = {
        "header": "test",
        "all_rows": "row1 row2 row3",
        "footer": "footer",
    }

    mocked_table_rows = ["row1", "row2", "row3"]
    mocked_row_named_groups = [
        {"row": "row1", "data": {"group_1": "value_1"}},
        {"row": "row2", "data": {"group_1": "value_1"}},
        {"row": "row3", "data": {"group_1": "value_1"}},
    ]

    mocked_extract_fields.return_value = mocked_fields
    mocked_extract_table_data.return_value = mocked_table_data
    mocked_get_table_rows.return_value = mocked_table_rows
    # side_effect with a list: each call returns the next element in order
    mocked_extract_row_named_groups.side_effect = mocked_row_named_groups

    # method invocation
    assert extract_ocr_data("ocr result", drm) == {}  # uniqueness fields fail

    # mock assertions
    mocked_extract_fields.assert_called_once_with("ocr result", drm)

    # The assertion name must be spelled exactly: a misspelled name such as
    # ``return_assert_called_once_with`` is silently auto-created by Mock as
    # a child attribute and verifies nothing.
    mocked_extract_table_data.assert_called_once_with("ocr result", drm)

    mocked_get_table_rows.assert_called_once_with(
        mocked_table_data["all_rows"], drm
    )

    # assert_has_calls must be *called* on the mock itself; assigning a list
    # to ``return_value.assert_has_calls`` would check nothing.
    mocked_extract_row_named_groups.assert_has_calls(
        [
            mock.call("row1", drm),
            mock.call("row2", drm),
            mock.call("row3", drm),
        ]
    )
Exemple #3
0
def parse_ocr_result(ocr_result, drms):
    """
    Parses and extracts data from the OCR document result string by
    using a DRM (Document Regexp Model) that matches this OCR string.

    When several DRMs match, only the first one returned by
    ``get_all_drms_match`` is used.

    Args:
        ocr_result (str): OCR result string;
        drms (list): list of all DRMs dicts found in the DRM directory folder.

    Returns:
        (dict): the extracted data from the OCR results, or an empty dict
                when no DRM matches the OCR result.

    Example of the extracted data:

        {
            "fields": {
                "field1": "result1",
                "field2": "result2"
            },
            "table": {
                "header": "table header",
                "all_rows": "all rows together here...",
                "rows": [
                    "row 1 result",
                    "row 2 result",
                    ...
                ],
                "footer": "table footer"
            }
        }
    """
    logger.info("Verifying DRMs that match with this OCR document string...")
    matching_drms = get_all_drms_match(ocr_result, drms)

    if not matching_drms:
        # The message states what is actually returned (an empty dict).
        logger.warning(
            "No DRM matches this OCR result. Returning an empty dict...")

        return {}

    # Only the first matching DRM is used for pre-processing and extraction.
    drm = matching_drms[0]
    logger.info("Using the following DRM: %s", drm)

    logger.info("Pre processing the OCR result according to DRM...")
    pre_processed_result = pre_process_result(ocr_result, drm)
    logger.debug("Showing pre processed OCR result...\n%s",
                 pre_processed_result)

    logger.info("Extracting json data from the OCR pre processed result...")
    return extract_ocr_data(pre_processed_result, drm)
def test_extract_ocr_data_table_match_rows_no_match(
    mocked_extract_fields, mocked_extract_table_data, mocked_get_table_rows
):
    """
    Unit: tests the overall ocr data extraction function where there is
          a table_data match but no row regexp matches.

    The three parameters are mock objects supplied from outside this
    function (presumably by ``mock.patch`` decorators -- TODO confirm).
    """
    # mock results
    mocked_fields = {"field1": "result1", "field2": "result2"}

    mocked_table_data = {
        "header": "test",
        "all_rows": "row1 row2 row3",
        "footer": "footer",
    }

    mocked_table_rows = []  # no row matches

    mocked_extract_fields.return_value = mocked_fields
    mocked_extract_table_data.return_value = mocked_table_data
    mocked_get_table_rows.return_value = mocked_table_rows

    # The expected table is an independent copy: aliasing mocked_table_data
    # here would put "rows" on the mock's return value before the call and
    # let the equality check pass even if the function never set it.
    expected_extracted_data = {
        "fields": mocked_fields,
        "table": dict(mocked_table_data, rows=[]),
    }

    # method invocation
    assert (
        extract_ocr_data("ocr result", {"test": "test"})
        == expected_extracted_data
    )

    # mock assertions
    mocked_extract_fields.assert_called_once_with(
        "ocr result", {"test": "test"}
    )

    # The assertion name must be spelled exactly: a misspelled name such as
    # ``return_assert_called_once_with`` is silently auto-created by Mock as
    # a child attribute and verifies nothing.
    mocked_extract_table_data.assert_called_once_with(
        "ocr result", {"test": "test"}
    )

    mocked_get_table_rows.assert_called_once_with(
        mocked_table_data["all_rows"], {"test": "test"}
    )