def test_pseudonymise_file():
    """Run the per-file pseudonymisation check over every test file.

    The default pseudonymisation keywords and replacement strategy are
    looked up once and then handed to each per-file check.
    """
    keywords = pseudonymisation_api.get_default_pseudonymisation_keywords()
    logging.info("Using pseudonymisation keywords")
    strategy = pseudonymisation_api.pseudonymisation_dispatch
    logging.info("Using pseudonymisation strategy")

    # Same settings for every file, so bundle them once.
    shared_settings = {
        "test_identifying_keywords": keywords,
        "test_replacement_strategy": strategy,
    }
    for dicom_path in get_test_filepaths():
        _test_pseudonymise_file_at_path(dicom_path, **shared_settings)
# Example #2
# 0
def _test_pseudonymise_file_at_path(
    test_file_path, test_identifying_keywords=None, test_replacement_strategy=None
):
    """Pseudonymise one DICOM file into a temporary directory and check it.

    Parameters
    ----------
    test_file_path
        Path of the DICOM file to pseudonymise.  It must not already be
        recognised as an anonymised file.
    test_identifying_keywords : optional
        Keywords to treat as identifying.  When ``None`` the default
        pseudonymisation keywords are used.
    test_replacement_strategy : optional
        Replacement strategy to apply.  When ``None`` the
        pseudonymisation dispatch table is used.
    """
    assert not is_anonymised_file(test_file_path)

    # Start from the caller's values and fall back to the defaults.
    keywords = test_identifying_keywords
    if keywords is None:
        keywords = pseudonymisation_api.get_default_pseudonymisation_keywords()
        logging.info("Using pseudonymisation keywords")

    strategy = test_replacement_strategy
    if strategy is None:
        strategy = pseudonymisation_api.pseudonymisation_dispatch
        logging.info("Using pseudonymisation strategy")

    with tempfile.TemporaryDirectory() as scratch_dir:
        # keywords_to_leave_unchanged is deliberately not passed.
        pseudo_path = anonymise_file(
            dicom_filepath=test_file_path,
            output_filepath=scratch_dir,
            delete_original_file=False,
            anonymise_filename=True,
            replace_values=True,
            delete_private_tags=True,
            delete_unknown_tags=True,
            replacement_strategy=strategy,
            identifying_keywords=keywords,
        )
        assert exists(pseudo_path)

        original_ds = pydicom.dcmread(test_file_path, force=True)
        pseudo_ds = pydicom.dcmread(pseudo_path, force=True)

        # Simplistic stand-in to make sure *something* changed.
        assert original_ds["PatientID"].value != pseudo_ds["PatientID"].value
        # Guard against accidentally using the hardcoded replacement approach.
        assert pseudo_ds["PatientID"].value not in ["", "Anonymous"]
# Example #3
# 0
def _zip_pseudo_fifty_mbytes(file_buffer_list: list,
                             zipfile_path: str,
                             zip_bytes_io=None):
    """Pseudonymises the contents of the file_buffer_list (list of DICOM files)
    and places the pseudonymised files in to a zip.

    Parameters
    ----------
    file_buffer_list : list
        List of DICOM file buffers from streamlit file_uploader to pseudonymise
    zipfile_path : str
        Location to write the zip file so that it can be downloaded.
        Basename provides default name to use for downloading
    zip_bytes_io : io.BytesIO, optional
        An in memory file like object to be used for storing the Zip instead of the
        zipfile_path.  Highly desirable because the zip written to zipfile_path can't
        be deleted from this module.

    Returns
    -------
    bool
        ``True`` if a file could not be read, pseudonymised or serialised
        (``KeyError``, ``IOError`` or ``ValueError``).  Processing stops at the
        first such file, so the zip then only holds the files handled before it.
        ``False`` when every file was added to the zip.

    """
    # Build a filtered copy instead of calling ``remove`` on the returned list:
    # that avoids mutating a list the API may share between calls, and avoids
    # a ValueError if "PatientSex" is not (or no longer) present.
    keywords = [
        keyword
        for keyword in pseudonymisation_api.get_default_pseudonymisation_keywords()
        if keyword != "PatientSex"
    ]
    strategy = pseudonymisation_api.pseudonymisation_dispatch
    zip_stream = zipfile_path if zip_bytes_io is None else zip_bytes_io

    bad_data = False
    with ZipFile(zip_stream, mode="w", compression=ZIP_DEFLATED) as myzip:
        for uploaded_file_buffer in file_buffer_list:
            # don't close the buffers.  Streamlit provides the user with control over that.
            # might be appropriate to close the buffers in some circumstances,
            # but then when the user goes to close the buffer (click x on screen)
            # there will be an error.

            # Resolve the display name before the try block so the error
            # report below can always refer to it.
            original_file_name = getattr(
                uploaded_file_buffer, "name", "<unnamed buffer>")
            try:
                ds_input: pydicom.FileDataset = pydicom.dcmread(
                    uploaded_file_buffer, force=True)

                anonymise_dataset(
                    ds_input,
                    delete_private_tags=True,
                    delete_unknown_tags=True,
                    # do the work in place.  less memory used and we're
                    # disposing shortly anyway
                    copy_dataset=False,
                    identifying_keywords=keywords,
                    replacement_strategy=strategy,
                )
                temp_anon_filepath = build_pseudonymised_file_name(ds_input)
                anon_filename = pathlib.Path(temp_anon_filepath).name
                # The context manager releases the scratch buffer even when
                # dcmwrite raises.
                with io.BytesIO() as in_memory_temp_file:
                    pydicom.dcmwrite(in_memory_temp_file, ds_input)
                    pseudonymised_bytes = in_memory_temp_file.getvalue()
            except (KeyError, IOError, ValueError) as e_info:
                print(e_info)
                print(f"While processing {original_file_name}")
                bad_data = True
                break
            # Kept outside the try block so zip-writing errors still propagate.
            myzip.writestr(
                anon_filename,
                pseudonymised_bytes,
                compress_type=ZIP_DEFLATED,
            )
    return bad_data
# Example #4
# 0
def anonymize(path, Datasets, FilePaths, rawdvh):
    """
    Create an anonymised (or pseudonymised) copy of an entire patient data set,
    including
    DICOM files,
    DVH CSV file,
    Clinical Data CSV file,
    Pyradiomics CSV file
    and place it in a new patient folder.

    The module-level ``FEATURE_TOGGLE_PSEUDONYMISE`` flag selects the technique:

    * flag off: the DICOM data is handed to ``_anonymise_dicom_data`` and the
      patient id used for the CSV exports is an md5/sha1 hash of the original
      id (or the id itself when the first file was previously anonymised).
    * flag on: every dataset is pseudonymised with ``pmp_anonymise`` and saved
      into a folder named after the pseudonymised patient id.

    In both cases the three CSV export helpers are then called with a ``CSV``
    subdirectory of the new patient folder, and, unless the first file was
    previously anonymised, the re-identification spreadsheet
    (``patientHash.csv``) is updated.

    Parameters
    ----------
    path: ``str``
        The current patient Directory.
        When pseudonymising, the new folder is created parallel to it,
        i.e. a child of the same parent directory.  When anonymising, the
        folder is whatever ``_anonymise_dicom_data`` returns (presumably also
        parallel to ``path`` -- TODO confirm against that helper).

    Datasets: ``dict`` with values of ``pydicom.dataset.Dataset``
        The set of DICOM data for the patient to be anonymised.
        The first value supplies ``PatientID`` and ``PatientName``.

    FilePaths: ``dict`` with values of ``string``
        The fully or partially qualified (relative to current working directory)
        filenames pointing to the patient's DICOM data.  Must be a mapping:
        the body calls ``.values()`` and ``.items()`` on it.

    rawdvh: ``dict`` with key = ROINumber, value = DVH
        a representation of the Dose Volume Histogram.
        NOTE(review): not referenced anywhere in this function body.

    Returns
    -------
    ``str``
        The directory name where the anonymised data has been placed
    """

    # Local aliases only; these refer to the caller's objects, no copy is made.
    all_filepaths = FilePaths
    new_dict_dataset = Datasets
    # The first file path / dataset stand in for the whole patient when
    # reading the patient identity and the "already anonymised" state.
    first_file_path = next(iter(all_filepaths.values()))
    first_dicom_object = next(iter(new_dict_dataset.values()))
    # Debug output of the inputs.
    print("\n\nCurrent Work Directory is:  ==== ", os.getcwd())
    print("IN ANON===================")
    print("\n\n\n=====Path in ANONYMIZation   ===", path)
    # print("=====Datasets========= in ANONYMIZation   ===",Datasets)
    print("\n\n\n=====FilePaths in ANONYMIZation   ===")  # ), all_filepaths)
    for key, filepath in all_filepaths.items():
        print(key, ":", filepath)
    # print("The value for CT 0 is : ", new_dict_dataset[0])
    # for key in Datasets:
    #     if (key != 'rtplan' and key != 'rtss' and key != 'rtdose'):
    #         print("Values are:  ",Datasets[key])

    file_previously_anonymised = _file_previously_anonymised(first_file_path)

    # Captured before any anonymisation; used later to locate the original
    # CSV data and for the re-identification record.
    Original_P_ID = first_dicom_object.PatientID

    patient_name_in_dataset = _trim_bracketing_single_quotes(
        first_dicom_object["PatientName"].repval
    )

    # get the pname_ID ("Patient Name  + PatientID") before any anonymisation
    # but there's no point in using it if the current data is already anonymised
    # the sha1_pname (md5 and sha1 hash) is no longer used directly,
    # instead use hashed_patient_id
    pname_ID, sha1_pname = _create_reidentification_item(
        first_dicom_object,
    )

    if not FEATURE_TOGGLE_PSEUDONYMISE:
        # Anonymisation branch.
        if not file_previously_anonymised:
            hashed_patient_id = _gen_md5_and_sha1_hash(Original_P_ID)
            # NOTE(review): hashed_patient_name is assigned in both arms
            # but not read again within this function.
            hashed_patient_name = _gen_md5_and_sha1_hash(patient_name_in_dataset)
        else:
            # Data already anonymised: keep the identifiers as they are.
            hashed_patient_id = Original_P_ID
            hashed_patient_name = patient_name_in_dataset

        # this build up of the anonymisation folder is not needed yet
        # because _anonymise_dicom_data returns it (and the value computed
        # here is overwritten just below), but I'd like to separate
        # the "anonymise the DICOM data" from "where does everything go"
        anonymised_patient_full_path = _build_anonymisation_folder_name(
            first_dicom_object, path, file_previously_anonymised
        )
        # NOTE(review): older comments call this helper `_anon_call`;
        # presumably that is the same function as _anonymise_dicom_data.
        # It currently modifies the datasets in hand as part of anonymisation
        # if the data does not appear to have already been anonymised
        anonymised_patient_full_path = _anonymise_dicom_data(
            path, new_dict_dataset, all_filepaths
        )
    else:
        # Pseudonymisation branch.
        # not bothering to check if the data itself was already pseudonymised.
        # if it was, just  apply (another round of) pseudonymisation.
        # "/" is stripped because the result is used as a folder name below.
        hashed_patient_id = pseudonymise.pseudonymisation_dispatch["LO"](
            Original_P_ID
        ).replace("/", "")
        # hashed_patient_name = pseudonymise.pseudonymisation_dispatch["PN"](patient_name_in_dataset)
        # changing the approach a bit with pseudonymisation
        # instead of using a hash of the patient name for the directory, use the pseudonymised
        # patient id, which is pretty much just a sha3 based hash.
        # This will then be consistent with the naming of the CSV files, which are based
        # on the hashed/pseudonymised patient id...
        anonymised_patient_full_path = pathlib.Path(path).parent.joinpath(
            hashed_patient_id
        )

        os.makedirs(anonymised_patient_full_path, exist_ok=True)
        # workaround for pseudonymisation failing when faced with SQ that are identifiers.
        # it was designed to pseudonymise what is *in* a SQ.
        # identifying_keywords_less_sequences = [
        #     x
        #     for x in pseudonymise.get_default_pseudonymisation_keywords()
        #     if not x.endswith("Sequence")
        # ]
        for key, dicom_object_as_dataset in new_dict_dataset.items():
            # _workaround_hacks_for_pmp_pseudo(dicom_object_as_dataset)
            ds_pseudo = pmp_anonymise(
                dicom_object_as_dataset,
                # Leave PatientWeight and PatientSize unmodified per @AAM
                keywords_to_leave_unchanged=["PatientSex", "PatientWeight", "PatientSize"],
                replacement_strategy=pseudonymise.pseudonymisation_dispatch,
                identifying_keywords=pseudonymise.get_default_pseudonymisation_keywords(),
            )
            # PatientSex has specific values that are valid.
            # pseudonymisation doesn't handle that any better
            # than other anonymisation techniques.
            # above, it's left alone.  But it could be set to empty
            # or it could be set to O.
            # But clinically... the gender of the patient can be quite relevant
            # and if the organ involved or imaged is sex linked or sex influenced (breast, prostate,
            # ovary), "hiding" the gender in the metadata may not really prevent re-identification
            # of the gender/PatientSex
            ds_pseudo_full_path = create_filename_from_dataset(
                ds_pseudo, anonymised_patient_full_path
            )
            ds_pseudo.save_as(ds_pseudo_full_path)

    print("\n\nThe New patient folder path is : ", anonymised_patient_full_path)

    # The CSV exports go into a "CSV" subdirectory of the new patient folder.
    anonymisation_csv_full_path = pathlib.Path().joinpath(
        anonymised_patient_full_path, "CSV"
    )
    if not os.path.exists(anonymisation_csv_full_path):
        os.makedirs(anonymisation_csv_full_path)

    # Each export helper receives the original id and directory (source side)
    # and the hashed id and CSV directory (destination side).
    _export_anonymised_dvh_data(
        Original_P_ID, path, hashed_patient_id, anonymisation_csv_full_path
    )

    _export_anonymised_clinical_data(
        Original_P_ID, path, hashed_patient_id, anonymisation_csv_full_path
    )

    _export_anonymised_pyradiomics_data(
        Original_P_ID,
        path,
        hashed_patient_id,
        anonymisation_csv_full_path,
        export_nrrd_files=False,  # TODO: ask AAM if he wants the nrrd files themselves copied
    )

    if not file_previously_anonymised:
        csv_filename = "patientHash.csv"
        # store the original vs. hashed values
        # appends if the re-identification spreadsheet is already present
        _create_reidentification_spreadsheet(pname_ID, hashed_patient_id, csv_filename)
        print("Updating patient re-identification spreadsheet")

    return str(anonymised_patient_full_path)