def test_pseudonymise_file():
    identifying_keywords_for_pseudo = (
        pseudonymisation_api.get_default_pseudonymisation_keywords()
    )
    logging.info("Using pseudonymisation keywords")
    replacement_strategy = pseudonymisation_api.pseudonymisation_dispatch
    logging.info("Using pseudonymisation strategy")
    for test_file_path in get_test_filepaths():
        _test_pseudonymise_file_at_path(
            test_file_path,
            test_identifying_keywords=identifying_keywords_for_pseudo,
            test_replacement_strategy=replacement_strategy,
        )
def _test_pseudonymise_file_at_path(
    test_file_path, test_identifying_keywords=None, test_replacement_strategy=None
):
    assert not is_anonymised_file(test_file_path)

    if test_identifying_keywords is None:
        identifying_keywords_for_pseudo = (
            pseudonymisation_api.get_default_pseudonymisation_keywords()
        )
        logging.info("Using pseudonymisation keywords")
    else:
        identifying_keywords_for_pseudo = test_identifying_keywords

    if test_replacement_strategy is None:
        replacement_strategy = pseudonymisation_api.pseudonymisation_dispatch
        logging.info("Using pseudonymisation strategy")
    else:
        replacement_strategy = test_replacement_strategy

    with tempfile.TemporaryDirectory() as output_directory:
        pseudonymised_file_path = anonymise_file(
            dicom_filepath=test_file_path,
            output_filepath=output_directory,
            delete_original_file=False,
            anonymise_filename=True,
            replace_values=True,
            # keywords_to_leave_unchanged=None,
            delete_private_tags=True,
            delete_unknown_tags=True,
            replacement_strategy=replacement_strategy,
            identifying_keywords=identifying_keywords_for_pseudo,
        )
        assert exists(pseudonymised_file_path)

        ds_input = pydicom.dcmread(test_file_path, force=True)
        ds_pseudo = pydicom.dcmread(pseudonymised_file_path, force=True)
        # Simplistic stand-in check to make sure *something* has happened
        assert ds_input["PatientID"].value != ds_pseudo["PatientID"].value
        # Make sure we are not accidentally using the hardcoded replacement approach
        assert ds_pseudo["PatientID"].value not in ["", "Anonymous"]
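# A minimal usage sketch (not part of the test suite above): pseudonymising a
# single DICOM file directly with anonymise_file, using the same default
# keywords and dispatch strategy the tests exercise. The function name and the
# output_directory parameter are assumptions made for illustration only.
def _example_pseudonymise_single_file(dicom_filepath, output_directory):
    keywords = pseudonymisation_api.get_default_pseudonymisation_keywords()
    strategy = pseudonymisation_api.pseudonymisation_dispatch
    # Returns the path of the pseudonymised copy written under output_directory
    return anonymise_file(
        dicom_filepath=dicom_filepath,
        output_filepath=output_directory,
        delete_original_file=False,
        anonymise_filename=True,
        replace_values=True,
        delete_private_tags=True,
        delete_unknown_tags=True,
        replacement_strategy=strategy,
        identifying_keywords=keywords,
    )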
def _zip_pseudo_fifty_mbytes(
    file_buffer_list: list, zipfile_path: str, zip_bytes_io=None
):
    """Pseudonymise the contents of file_buffer_list (a list of DICOM file
    buffers) and place the pseudonymised files into a zip.

    Parameters
    ----------
    file_buffer_list : list
        List of DICOM file buffers from the streamlit file_uploader to pseudonymise
    zipfile_path : str
        Location to write the zip file so that it can be downloaded.
        The basename provides the default name to use when downloading.
    zip_bytes_io : io.BytesIO, optional
        An in-memory file-like object used to store the zip instead of
        zipfile_path. Highly desirable, because a zip written to zipfile_path
        cannot be deleted from this module.

    Returns
    -------
    bad_data : bool
        True if any of the uploaded buffers could not be pseudonymised.
    """
    bad_data = False
    file_count = 0
    keywords = pseudonymisation_api.get_default_pseudonymisation_keywords()
    keywords.remove("PatientSex")
    strategy = pseudonymisation_api.pseudonymisation_dispatch

    zip_stream = zipfile_path
    if zip_bytes_io is not None:
        zip_stream = zip_bytes_io

    with ZipFile(zip_stream, mode="w", compression=ZIP_DEFLATED) as myzip:
        for uploaded_file_buffer in file_buffer_list:
            file_count += 1

            # Don't close the buffers: streamlit gives the user control over
            # that. It might be appropriate to close them in some
            # circumstances, but then when the user closes the buffer
            # (clicks x on screen) there will be an error.
            try:
                original_file_name = uploaded_file_buffer.name
                ds_input: pydicom.FileDataset = pydicom.dcmread(
                    uploaded_file_buffer, force=True
                )
                anonymise_dataset(
                    ds_input,
                    delete_private_tags=True,
                    delete_unknown_tags=True,
                    # Do the work in place: less memory used, and the dataset
                    # is disposed of shortly anyway.
                    copy_dataset=False,
                    identifying_keywords=keywords,
                    replacement_strategy=strategy,
                )
                temp_anon_filepath = build_pseudonymised_file_name(ds_input)
                in_memory_temp_file = io.BytesIO()
                anon_filename = pathlib.Path(temp_anon_filepath).name
                pydicom.dcmwrite(in_memory_temp_file, ds_input)
            except (KeyError, IOError, ValueError) as e_info:
                print(e_info)
                print(f"While processing {original_file_name}")
                bad_data = True
                break
            myzip.writestr(
                anon_filename,
                in_memory_temp_file.getvalue(),
                compress_type=ZIP_DEFLATED,
            )
            in_memory_temp_file.close()
    return bad_data
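# A minimal usage sketch (not part of the original module): one way
# _zip_pseudo_fifty_mbytes might be driven from a Streamlit page, writing the
# zip to an in-memory buffer for download rather than to disk. The widget
# labels, the function name, and the "pseudonymised_dicom.zip" file name are
# assumptions for this example; the imports are local to keep the sketch
# self-contained.
def _example_streamlit_zip_download():
    import io

    import streamlit as st

    uploaded_files = st.file_uploader(
        "DICOM files to pseudonymise", accept_multiple_files=True
    )
    if uploaded_files:
        zip_bytes = io.BytesIO()
        had_bad_data = _zip_pseudo_fifty_mbytes(
            uploaded_files, "pseudonymised_dicom.zip", zip_bytes_io=zip_bytes
        )
        if not had_bad_data:
            st.download_button(
                "Download pseudonymised zip",
                data=zip_bytes.getvalue(),
                file_name="pseudonymised_dicom.zip",
            )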
def anonymize(path, Datasets, FilePaths, rawdvh):
    """
    Create an anonymised copy of an entire patient data set (DICOM files,
    DVH CSV file, Clinical Data CSV file and Pyradiomics CSV file) and place
    it in a subdirectory of the specified path.

    Parameters
    ----------
    path: ``str``
        The current patient directory. The anonymised data will be placed
        parallel to it, i.e. as a child of the same parent directory.

    Datasets: ``dict`` with values of ``pydicom.dataset.Dataset``
        The set of DICOM data for the patient to be anonymised.

    FilePaths: ``dict`` of ``str``
        The fully or partially qualified (relative to the current working
        directory) filenames pointing to the patient's DICOM data.

    rawdvh: ``dict`` with key = ROINumber, value = DVH
        A representation of the Dose Volume Histogram.

    Returns
    -------
    Full_Patient_Path_New_folder: ``str``
        The fully qualified directory name where the anonymised data has
        been placed.
    """
    all_filepaths = FilePaths
    new_dict_dataset = Datasets

    first_file_path = next(iter(all_filepaths.values()))
    first_dicom_object = next(iter(new_dict_dataset.values()))

    print("Current working directory is:", os.getcwd())
    print("Path in anonymisation:", path)
    print("FilePaths in anonymisation:")
    for key, filepath in all_filepaths.items():
        print(key, ":", filepath)

    file_previously_anonymised = _file_previously_anonymised(first_file_path)

    Original_P_ID = first_dicom_object.PatientID
    patient_name_in_dataset = _trim_bracketing_single_quotes(
        first_dicom_object["PatientName"].repval
    )

    # Get the pname_ID ("Patient Name + PatientID") before any anonymisation,
    # although there is no point in using it if the current data is already
    # anonymised. The sha1_pname (md5 and sha1 hash) is no longer used
    # directly; hashed_patient_id is used instead.
    pname_ID, sha1_pname = _create_reidentification_item(
        first_dicom_object,
    )

    if not FEATURE_TOGGLE_PSEUDONYMISE:
        if not file_previously_anonymised:
            hashed_patient_id = _gen_md5_and_sha1_hash(Original_P_ID)
            hashed_patient_name = _gen_md5_and_sha1_hash(patient_name_in_dataset)
        else:
            hashed_patient_id = Original_P_ID
            hashed_patient_name = patient_name_in_dataset

        # Building the anonymisation folder name here is not strictly needed
        # yet, because _anonymise_dicom_data returns it, but it helps separate
        # "anonymise the DICOM data" from "where does everything go".
        anonymised_patient_full_path = _build_anonymisation_folder_name(
            first_dicom_object, path, file_previously_anonymised
        )

        # _anonymise_dicom_data currently modifies the datasets in hand as part
        # of anonymisation if the data does not appear to have already been
        # anonymised.
        anonymised_patient_full_path = _anonymise_dicom_data(
            path, new_dict_dataset, all_filepaths
        )
    else:
        # Not bothering to check whether the data itself was already
        # pseudonymised; if it was, just apply (another round of)
        # pseudonymisation.
        hashed_patient_id = pseudonymise.pseudonymisation_dispatch["LO"](
            Original_P_ID
        ).replace("/", "")
        # hashed_patient_name = pseudonymise.pseudonymisation_dispatch["PN"](
        #     patient_name_in_dataset
        # )

        # The approach changes a bit with pseudonymisation: instead of using a
        # hash of the patient name for the directory, use the pseudonymised
        # patient id, which is essentially a SHA3-based hash.
        # This will then be consistent with the naming of the CSV files, which
        # are based on the hashed/pseudonymised patient id.
        anonymised_patient_full_path = pathlib.Path(path).parent.joinpath(
            hashed_patient_id
        )
        os.makedirs(anonymised_patient_full_path, exist_ok=True)

        # Workaround for pseudonymisation failing when faced with SQ elements
        # that are themselves identifiers; it was designed to pseudonymise what
        # is *in* an SQ.
        # identifying_keywords_less_sequences = [
        #     x
        #     for x in pseudonymise.get_default_pseudonymisation_keywords()
        #     if not x.endswith("Sequence")
        # ]

        for key, dicom_object_as_dataset in new_dict_dataset.items():
            # _workaround_hacks_for_pmp_pseudo(dicom_object_as_dataset)
            ds_pseudo = pmp_anonymise(
                dicom_object_as_dataset,
                # Leave PatientWeight and PatientSize unmodified per @AAM
                keywords_to_leave_unchanged=[
                    "PatientSex",
                    "PatientWeight",
                    "PatientSize",
                ],
                replacement_strategy=pseudonymise.pseudonymisation_dispatch,
                identifying_keywords=pseudonymise.get_default_pseudonymisation_keywords(),
            )
            # PatientSex has a specific set of valid values, and pseudonymisation
            # doesn't handle that any better than other anonymisation techniques.
            # Above, it is left alone, but it could be set to empty or to "O".
            # Clinically, however, the sex of the patient can be quite relevant,
            # and if the organ involved or imaged is sex linked or sex influenced
            # (breast, prostate, ovary), "hiding" PatientSex in the metadata may
            # not really prevent re-identification of the patient's sex.
            ds_pseudo_full_path = create_filename_from_dataset(
                ds_pseudo, anonymised_patient_full_path
            )
            ds_pseudo.save_as(ds_pseudo_full_path)

    print("The new patient folder path is:", anonymised_patient_full_path)

    anonymisation_csv_full_path = pathlib.Path().joinpath(
        anonymised_patient_full_path, "CSV"
    )
    if not os.path.exists(anonymisation_csv_full_path):
        os.makedirs(anonymisation_csv_full_path)

    _export_anonymised_dvh_data(
        Original_P_ID, path, hashed_patient_id, anonymisation_csv_full_path
    )
    _export_anonymised_clinical_data(
        Original_P_ID, path, hashed_patient_id, anonymisation_csv_full_path
    )
    _export_anonymised_pyradiomics_data(
        Original_P_ID,
        path,
        hashed_patient_id,
        anonymisation_csv_full_path,
        export_nrrd_files=False,  # TODO: ask AAM whether the nrrd files themselves should be copied
    )

    if not file_previously_anonymised:
        csv_filename = "patientHash.csv"
        # Store the original vs. hashed values, appending if the
        # re-identification spreadsheet is already present.
        _create_reidentification_spreadsheet(pname_ID, hashed_patient_id, csv_filename)
        print("Updating patient re-identification spreadsheet")

    return str(anonymised_patient_full_path)
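# A minimal usage sketch (not the application's actual call site): illustrates
# the argument shapes anonymize() expects. The helper name and the way the
# datasets/filepaths dictionaries are obtained are assumptions for this example.
def _example_anonymize_patient(patient_directory, datasets, filepaths, rawdvh):
    # datasets: dict whose values are pydicom.dataset.Dataset objects,
    #           keyed consistently with filepaths
    # filepaths: dict of the corresponding DICOM file paths
    # rawdvh: dict with key = ROINumber and value = DVH
    anonymised_folder = anonymize(patient_directory, datasets, filepaths, rawdvh)
    print("Anonymised data written to:", anonymised_folder)
    return anonymised_folder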