Esempio n. 1
0
def process_pdf_bundle(file_bundles: List[FileBundle],
                       fields_to_redact: List[str]):
    """Render every PDF bundle and run the redaction steps on the result.

    For each bundle the source file is rendered inside the pre-processing
    folder, then the rendered image, the FOTT label file and the OCR result
    are redacted into the output folder.

    Args:
        file_bundles: Bundles whose files live in the pre-processing folder.
        fields_to_redact: Forwarded unchanged to every redaction call.
    """
    pdf_renderer = PdfRenderer()

    for bundle in file_bundles:
        pre_processing_bundle = PdfPreProcessingBundle.from_file_bundle(bundle)
        rendered_name = pre_processing_bundle.rendered_file_name

        # Output names are derived from the rendered image, not the source PDF.
        image_out_name = get_redacted_file_name(rendered_name)
        fott_out_name = get_redacted_file_name(bundle.fott_file_name)
        ocr_out_name = get_redacted_file_name(bundle.ocr_file_name)

        # All inputs are read from the pre-processing folder.
        source_pdf = Path(build_pre_processing_folder, bundle.image_file_name)
        rendered_image = Path(build_pre_processing_folder, rendered_name)
        fott_label = Path(build_pre_processing_folder, bundle.fott_file_name)
        ocr_result = Path(build_pre_processing_folder, bundle.ocr_file_name)

        # Render PDF
        pdf_renderer.render_pdf_and_save(
            source_pdf, rendered_image, target_pdf_render_dpi)

        # Same redaction sequence as for regular bundles, only the source
        # folder differs.
        redact_image(
            rendered_image,
            fott_label,
            Path(build_output_folder, image_out_name),
            fields_to_redact)
        redact_fott_label(
            fott_label,
            Path(build_output_folder, fott_out_name),
            fields_to_redact)
        redact_ocr_result(
            ocr_result,
            fott_label,
            Path(build_output_folder, ocr_out_name),
            fields_to_redact)
Esempio n. 2
0
def test_redact_image_file(tempdir, capsys):
    """Redact the sample PNG and expect the output path in captured stdout."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted.png')

    redact.redact_image(source_png, redacted_png)

    # Index 0 of the captured pair is stdout.
    captured_stdout = capsys.readouterr()[0]
    assert redacted_png in captured_stdout
Esempio n. 3
0
def test_redact_image_file(tempdir, capsys):
    """Redact two info types from the sample PNG and check stdout."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted.png')
    info_types = ['FIRST_NAME', 'EMAIL_ADDRESS']

    redact.redact_image(GCLOUD_PROJECT, source_png, redacted_png, info_types)

    # Index 0 of the captured pair is stdout.
    captured_stdout = capsys.readouterr()[0]
    assert redacted_png in captured_stdout
Esempio n. 4
0
def test_redact_image_file(tempdir, capsys):
    """Redact two info types from the sample PNG and check stdout."""
    source_png = os.path.join(RESOURCE_DIRECTORY, "test.png")
    redacted_png = os.path.join(tempdir, "redacted.png")
    info_types = ["FIRST_NAME", "EMAIL_ADDRESS"]

    redact.redact_image(GCLOUD_PROJECT, source_png, redacted_png, info_types)

    # Index 0 of the captured pair is stdout.
    captured_stdout = capsys.readouterr()[0]
    assert redacted_png in captured_stdout
Esempio n. 5
0
def test_redact_image_file_with_infotype(tempdir, capsys):
    """Redact with explicit info types and expect the output path on stdout."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted_with_infotype.png')
    requested_info_types = ['EMAIL_ADDRESS', 'US_MALE_NAME']

    redact.redact_image(
        source_png, redacted_png, info_types=requested_info_types)

    # Index 0 of the captured pair is stdout.
    captured_stdout = capsys.readouterr()[0]
    assert redacted_png in captured_stdout
def test_redact_image_file(tempdir, capsys):
    """Redact two info types from the sample PNG and check stdout."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted.png')
    info_types = ['FIRST_NAME', 'EMAIL_ADDRESS']

    redact.redact_image(GCLOUD_PROJECT, source_png, redacted_png, info_types)

    # Index 0 of the captured pair is stdout.
    captured_stdout = capsys.readouterr()[0]
    assert redacted_png in captured_stdout
Esempio n. 7
0
            pdf_file_bundle_list = reader.download_bundles(
                to=build_pre_processing_folder, mode=FileType.PDF_ONLY)
            file_bundle_list = reader.download_bundles(to=build_input_folder)
        else:
            reader = LocalReader(input_path)
            pdf_file_bundle_list = reader.copy_bundles(
                to=build_pre_processing_folder, mode=FileType.PDF_ONLY)
            file_bundle_list = reader.copy_bundles(to=build_input_folder)

        for fb in file_bundle_list:
            redacted_image_name = get_redacted_file_name(fb.image_file_name)
            redacted_fott_name = get_redacted_file_name(fb.fott_file_name)
            redacted_ocr_name = get_redacted_file_name(fb.ocr_file_name)

            redact_image(Path(build_input_folder, fb.image_file_name),
                         Path(build_input_folder, fb.fott_file_name),
                         Path(build_output_folder, redacted_image_name),
                         fields_to_redact)
            redact_fott_label(Path(build_input_folder, fb.fott_file_name),
                              Path(build_output_folder, redacted_fott_name),
                              fields_to_redact)
            redact_ocr_result(Path(build_input_folder, fb.ocr_file_name),
                              Path(build_input_folder, fb.fott_file_name),
                              Path(build_output_folder, redacted_ocr_name),
                              fields_to_redact)

        # Render and process PDF files if any
        if pdf_file_bundle_list is not None:
            process_pdf_bundle(pdf_file_bundle_list, fields_to_redact)

        if is_blob_url(output_container):
            writer = BlobWriter(output_container, output_path)
Esempio n. 8
0
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project
# root for license information.

import sys
from redact import redact_image, redact_fott_label, redact_ocr_result

if __name__ == '__main__':
    operator = sys.argv[1]

    if operator == 'image':
        labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',')
        redact_image(image_path=sys.argv[2],
                     fott_label_path=sys.argv[3],
                     output_path=sys.argv[4],
                     labels_to_redact=labels_to_redact)

    elif operator == 'fott':
        labels_to_redact = [] if len(sys.argv) < 5 else sys.argv[4].split(',')
        redact_fott_label(fott_label_path=sys.argv[2],
                          output_path=sys.argv[3],
                          labels_to_redact=labels_to_redact)

    elif operator == 'ocr':
        labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',')
        redact_ocr_result(ocr_result_path=sys.argv[2],
                          fott_label_path=sys.argv[3],
                          output_path=sys.argv[4],
                          labels_to_redact=labels_to_redact)

    else: