def process_pdf_bundle(file_bundles: List[FileBundle], fields_to_redact: List[str]):
    """Render every PDF bundle and run the three redaction steps on it.

    For each bundle the PDF is rendered with
    ``PdfRenderer.render_pdf_and_save`` at ``target_pdf_render_dpi``. The
    rendered file, the FOTT label file and the OCR result file are then
    handed to ``redact_image``, ``redact_fott_label`` and
    ``redact_ocr_result``. Every input path is built inside
    ``build_pre_processing_folder`` and every output path inside
    ``build_output_folder``.

    Args:
        file_bundles: Bundles to process, one PDF each.
        fields_to_redact: Passed unchanged to each redaction function.
    """
    pdf_renderer = PdfRenderer()

    for bundle in file_bundles:
        pre_processing = PdfPreProcessingBundle.from_file_bundle(bundle)

        # Names of the redacted artefacts (image, FOTT label, OCR result).
        image_out_name = get_redacted_file_name(pre_processing.rendered_file_name)
        fott_out_name = get_redacted_file_name(bundle.fott_file_name)
        ocr_out_name = get_redacted_file_name(bundle.ocr_file_name)

        # Inputs all live in the pre-processing folder.
        pdf_source = Path(build_pre_processing_folder, bundle.image_file_name)
        rendered_image = Path(build_pre_processing_folder,
                              pre_processing.rendered_file_name)
        fott_source = Path(build_pre_processing_folder, bundle.fott_file_name)
        ocr_source = Path(build_pre_processing_folder, bundle.ocr_file_name)

        # Step 1: render the PDF next to its source file.
        pdf_renderer.render_pdf_and_save(
            pdf_source, rendered_image, target_pdf_render_dpi)

        # Step 2: the usual redaction sequence, except that the image input
        # is the rendered file taken from the pre-processing folder.
        redact_image(
            rendered_image,
            fott_source,
            Path(build_output_folder, image_out_name),
            fields_to_redact)
        redact_fott_label(
            fott_source,
            Path(build_output_folder, fott_out_name),
            fields_to_redact)
        redact_ocr_result(
            ocr_source,
            fott_source,
            Path(build_output_folder, ocr_out_name),
            fields_to_redact)
def test_redact_image_file(tempdir, capsys):
    """Redact the sample PNG and check the output path shows up on stdout."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted.png')

    redact.redact_image(source_png, redacted_png)

    # Index 0 of the captured result is stdout; stderr is not inspected.
    captured = capsys.readouterr()
    assert redacted_png in captured[0]
def test_redact_image_file(tempdir, capsys):
    """Redact two info types in the sample PNG; stdout must name the output."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted.png')
    info_types = ['FIRST_NAME', 'EMAIL_ADDRESS']

    redact.redact_image(GCLOUD_PROJECT, source_png, redacted_png, info_types)

    # Index 0 of the captured result is stdout; stderr is not inspected.
    captured = capsys.readouterr()
    assert redacted_png in captured[0]
def test_redact_image_file(tempdir, capsys):
    """Redact two info types in the sample PNG; stdout must name the output."""
    source_png = os.path.join(RESOURCE_DIRECTORY, "test.png")
    redacted_png = os.path.join(tempdir, "redacted.png")
    info_types = ["FIRST_NAME", "EMAIL_ADDRESS"]

    redact.redact_image(GCLOUD_PROJECT, source_png, redacted_png, info_types)

    # Index 0 of the captured result is stdout; stderr is not inspected.
    captured = capsys.readouterr()
    assert redacted_png in captured[0]
def test_redact_image_file_with_infotype(tempdir, capsys):
    """Redact with explicit ``info_types``; stdout must name the output."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted_with_infotype.png')
    requested_types = ['EMAIL_ADDRESS', 'US_MALE_NAME']

    redact.redact_image(source_png, redacted_png, info_types=requested_types)

    # Index 0 of the captured result is stdout; stderr is not inspected.
    captured = capsys.readouterr()
    assert redacted_png in captured[0]
def test_redact_image_file(tempdir, capsys):
    """Redact two info types in the sample PNG; stdout must name the output."""
    source_png = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    redacted_png = os.path.join(tempdir, 'redacted.png')
    info_types = ['FIRST_NAME', 'EMAIL_ADDRESS']

    redact.redact_image(GCLOUD_PROJECT, source_png, redacted_png, info_types)

    # Index 0 of the captured result is stdout; stderr is not inspected.
    captured = capsys.readouterr()
    assert redacted_png in captured[0]
pdf_file_bundle_list = reader.download_bundles( to=build_pre_processing_folder, mode=FileType.PDF_ONLY) file_bundle_list = reader.download_bundles(to=build_input_folder) else: reader = LocalReader(input_path) pdf_file_bundle_list = reader.copy_bundles( to=build_pre_processing_folder, mode=FileType.PDF_ONLY) file_bundle_list = reader.copy_bundles(to=build_input_folder) for fb in file_bundle_list: redacted_image_name = get_redacted_file_name(fb.image_file_name) redacted_fott_name = get_redacted_file_name(fb.fott_file_name) redacted_ocr_name = get_redacted_file_name(fb.ocr_file_name) redact_image(Path(build_input_folder, fb.image_file_name), Path(build_input_folder, fb.fott_file_name), Path(build_output_folder, redacted_image_name), fields_to_redact) redact_fott_label(Path(build_input_folder, fb.fott_file_name), Path(build_output_folder, redacted_fott_name), fields_to_redact) redact_ocr_result(Path(build_input_folder, fb.ocr_file_name), Path(build_input_folder, fb.fott_file_name), Path(build_output_folder, redacted_ocr_name), fields_to_redact) # Render and process PDF files if any if pdf_file_bundle_list is not None: process_pdf_bundle(pdf_file_bundle_list, fields_to_redact) if is_blob_url(output_container): writer = BlobWriter(output_container, output_path)
# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project # root for license information. import sys from redact import redact_image, redact_fott_label, redact_ocr_result if __name__ == '__main__': operator = sys.argv[1] if operator == 'image': labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',') redact_image(image_path=sys.argv[2], fott_label_path=sys.argv[3], output_path=sys.argv[4], labels_to_redact=labels_to_redact) elif operator == 'fott': labels_to_redact = [] if len(sys.argv) < 5 else sys.argv[4].split(',') redact_fott_label(fott_label_path=sys.argv[2], output_path=sys.argv[3], labels_to_redact=labels_to_redact) elif operator == 'ocr': labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',') redact_ocr_result(ocr_result_path=sys.argv[2], fott_label_path=sys.argv[3], output_path=sys.argv[4], labels_to_redact=labels_to_redact) else: