def deidentify_with_redact(self, input_str, info_types):
    """Redact every match of the given info types from a string.

    Sends ``input_str`` to the DLP ``deidentify_content`` call under the
    project stored in ``self.project_id``, using an empty ``redact_config``
    as the only primitive transformation.

    Args:
        input_str: The text to de-identify.
        info_types: Iterable of info type names to look for.

    Returns:
        The ``item.value`` attribute of the API response.
    """
    project_id = self.project_id
    client = google.cloud.dlp_v2.DlpServiceClient()

    redact_transformation = {"primitive_transformation": {"redact_config": {}}}
    request = {
        "parent": f"projects/{project_id}",
        "deidentify_config": {
            "info_type_transformations": {
                "transformations": [redact_transformation]
            }
        },
        "inspect_config": {
            "info_types": [{"name": type_name} for type_name in info_types]
        },
        "item": {"value": input_str},
    }

    result = client.deidentify_content(request=request)
    return result.item.value
def process(self, elem):
    """De-identify the e-mail address of every event in a batch.

    The e-mail of each event is placed in a one-column ("EMAIL") table and
    sent to DLP ``deidentify_content`` together with the de-identify
    template ``projects/<project_id>/deidentifyTemplates/<template_id>``.
    The returned cell for each row replaces the e-mail when building the
    output ``User``.

    Args:
        elem: Iterable of events. Each event must expose ``email``,
            ``full_name``, ``job`` and ``city`` attributes.

    Yields:
        ``User(...).obj_to_Row()`` for each event, in input order.
    """
    # The batch is walked twice (once to build the request, once to pair the
    # results), so materialize it: a one-shot iterator would otherwise be
    # exhausted by the first pass and yield nothing.
    events = list(elem)
    if not events:
        # Nothing to de-identify; skip the API round trip entirely.
        return

    import google.cloud.dlp

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    table = {
        "headers": [{"name": "EMAIL"}],
        "rows": [
            {"values": [{"string_value": str(event.email)}]}
            for event in events
        ],
    }

    parent = dlp.project_path(self.project_id)
    deidentify_template_name = "projects/{}/deidentifyTemplates/{}".format(
        self.project_id, self.template_id.get())

    response = dlp.deidentify_content(
        parent,
        deidentify_template_name=deidentify_template_name,
        item={"table": table})

    # Index into the response (rather than zip) so a short response still
    # raises IndexError instead of silently dropping events.
    result_rows = response.item.table.rows
    for index, event in enumerate(events):
        user = User(event.full_name,
                    result_rows[index].values[0].string_value,
                    event.job,
                    event.city)
        yield user.obj_to_Row()
def deidentify_with_replace(
    project,
    input_str,
    info_types,
    replacement_str="REPLACEMENT_STR",
):
    """Replace each detected info type match in a string with fixed text.

    Args:
        project: Google Cloud project id used as the parent resource.
        input_str: The text to de-identify.
        info_types: Iterable of info type names to look for.
        replacement_str: Text substituted for every match.

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path(project)

    wanted_types = [{"name": type_name} for type_name in info_types]
    replace_with = {"new_value": {"string_value": replacement_str}}
    transformations = [
        {"primitive_transformation": {"replace_config": replace_with}}
    ]

    result = client.deidentify_content(
        parent_path,
        inspect_config={"info_types": wanted_types},
        deidentify_config={
            "info_type_transformations": {"transformations": transformations}
        },
        item={"value": input_str},
    )

    print(result.item.value)
def deidentify_with_mask(
    project, input_str, info_types, masking_character=None, number_to_mask=0
):
    """Mask detected sensitive data in a string with a masking character.

    Args:
        project: Google Cloud project id used as the parent resource.
        input_str: The text to de-identify.
        info_types: Iterable of info type names to look for.
        masking_character: Character passed to the API as the mask.
        number_to_mask: Value passed to the API as ``number_to_mask``
            (defaults to 0).

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    mask_settings = {
        "masking_character": masking_character,
        "number_to_mask": number_to_mask,
    }
    request = {
        "parent": f"projects/{project}",
        "deidentify_config": {
            "info_type_transformations": {
                "transformations": [
                    {
                        "primitive_transformation": {
                            "character_mask_config": mask_settings
                        }
                    }
                ]
            }
        },
        "inspect_config": {
            "info_types": [{"name": type_name} for type_name in info_types]
        },
        "item": {"value": input_str},
    }

    result = client.deidentify_content(request=request)
    print(result.item.value)
def deidentify_with_mask(project, string, info_types, masking_character=None,
                         number_to_mask=0):
    """Mask detected sensitive data in a string with a masking character.

    Args:
        project: Google Cloud project id used as the parent resource.
        string: The text to de-identify.
        info_types: Iterable of info type names to look for.
        masking_character: Character passed to the API as the mask.
        number_to_mask: Value passed to the API as ``number_to_mask``
            (defaults to 0).

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()
    parent_path = client.project_path(project)

    type_filters = [{'name': type_name} for type_name in info_types]
    mask_transformation = {
        'primitive_transformation': {
            'character_mask_config': {
                'masking_character': masking_character,
                'number_to_mask': number_to_mask,
            }
        }
    }

    result = client.deidentify_content(
        parent_path,
        inspect_config={'info_types': type_filters},
        deidentify_config={
            'info_type_transformations': {
                'transformations': [mask_transformation]
            }
        },
        item={'value': string},
    )

    print(result.item.value)
def deidentify_with_mask(project, string, info_types, masking_character=None,
                         number_to_mask=0):
    """De-identify a string by character-masking matches of the info types.

    Args:
        project: Google Cloud project id used as the parent resource.
        string: The text to de-identify.
        info_types: Iterable of info type names to look for.
        masking_character: Character passed to the API as the mask.
        number_to_mask: Value passed to the API as ``number_to_mask``
            (defaults to 0).

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    service = google.cloud.dlp.DlpServiceClient()
    parent_resource = service.project_path(project)

    # Assemble the request pieces from the innermost dictionary outwards.
    mask_options = {
        'masking_character': masking_character,
        'number_to_mask': number_to_mask,
    }
    primitive = {'character_mask_config': mask_options}
    masking = {
        'info_type_transformations': {
            'transformations': [{'primitive_transformation': primitive}]
        }
    }
    inspection = {'info_types': [{'name': kind} for kind in info_types]}
    content = {'value': string}

    reply = service.deidentify_content(
        parent_resource,
        inspect_config=inspection,
        deidentify_config=masking,
        item=content)

    print(reply.item.value)
def deidentify_with_redact(
    project,
    input_str,
    info_types,
):
    """Redact every match of the given info types from a string.

    Args:
        project: Google Cloud project id used as the parent resource.
        input_str: The text to de-identify.
        info_types: Iterable of info type names to look for.

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # An empty redact_config is the whole transformation.
    redaction = {"primitive_transformation": {"redact_config": {}}}
    request = {
        "parent": f"projects/{project}",
        "deidentify_config": {
            "info_type_transformations": {"transformations": [redaction]}
        },
        "inspect_config": {
            "info_types": [{"name": type_name} for type_name in info_types]
        },
        "item": {"value": input_str},
    }

    result = client.deidentify_content(request=request)
    print(result.item.value)
def deindentify(self, data, project_id, template_id):
    """De-identify tabular data with a stored DLP de-identify template.

    Args:
        data: Mapping with a ``"header"`` entry (iterable of column names)
            and a ``"rows"`` entry (iterable of rows, each an iterable of
            cell values sent as ``string_value``).
        project_id: Google Cloud project id used as the parent resource.
        template_id: Id of the de-identify template inside that project.

    Returns:
        The raw response object returned by ``deidentify_content``.
    """
    table = {
        "headers": [{"name": column} for column in data["header"]],
        "rows": [
            {"values": [{"string_value": cell} for cell in record]}
            for record in data["rows"]
        ],
    }

    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path(project_id)
    template_name = f"projects/{project_id}/deidentifyTemplates/{template_id}"

    return client.deidentify_content(
        parent_path,
        deidentify_template_name=template_name,
        item={"table": table},
    )
def deidentify_with_replace_infotype(project, item, info_types):
    """De-identify a string using the ``replace_with_info_type_config``.

    Args:
        project: Google Cloud project id used as the parent resource.
        item: The text to de-identify; it is wrapped as ``{"value": item}``.
        info_types: Iterable of info type names to look for.

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path(project)

    type_filters = [{"name": type_name} for type_name in info_types]
    transformation = {
        "primitive_transformation": {"replace_with_info_type_config": {}}
    }

    result = client.deidentify_content(
        parent_path,
        inspect_config={"info_types": type_filters},
        deidentify_config={
            "info_type_transformations": {"transformations": [transformation]}
        },
        item={"value": item},
    )

    print(result.item.value)
def deidentify_with_mask(project, string, info_types, masking_character=None,
                         number_to_mask=0):
    """Mask detected sensitive data in a string and return the result.

    Args:
        project: Google Cloud project id used as the parent resource.
        string: The text to de-identify.
        info_types: Iterable of info type names to look for.
        masking_character: Character passed to the API as the mask.
        number_to_mask: Value passed to the API as ``number_to_mask``
            (defaults to 0).

    Returns:
        The ``item.value`` attribute of the API response.
    """
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()
    parent_path = client.project_path(project)

    inspection = {
        'info_types': [{'name': type_name} for type_name in info_types]
    }
    character_mask = {
        'masking_character': masking_character,
        'number_to_mask': number_to_mask,
    }
    transformations = [
        {'primitive_transformation': {'character_mask_config': character_mask}}
    ]
    deidentification = {
        'info_type_transformations': {'transformations': transformations}
    }

    result = client.deidentify_content(
        parent_path,
        inspect_config=inspection,
        deidentify_config=deidentification,
        item={'value': string})
    return result.item.value
def deidentify_with_mask(data, done):
    """Mask sensitive data in every object of ``SENSITIVE_BUCKET``.

    Each blob in ``SENSITIVE_BUCKET`` is downloaded, sent to DLP
    ``deidentify_content`` with an ``'X'`` character mask for the info types
    in ``INFO_TYPES``, and the masked content is uploaded under the same
    blob name to ``MASKED_BUCKET``.

    Relies on the module-level names ``dlp``, ``storage``, ``Blob``,
    ``PROJECT_ID``, ``INFO_TYPES``, ``SENSITIVE_BUCKET`` and
    ``MASKED_BUCKET``.

    Args:
        data: Not used by this function.
        done: Not used by this function.

    Returns:
        None.
    """
    parent = dlp.project_path(PROJECT_ID)

    inspect_config = {
        'info_types': [{'name': info_type} for info_type in INFO_TYPES]
    }
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'character_mask_config': {
                            'masking_character': 'X',
                            'number_to_mask': 0
                        }
                    }
                }
            ]
        }
    }

    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(SENSITIVE_BUCKET)
    # The destination bucket does not change between blobs, so look it up
    # once instead of issuing one lookup per object.
    destination_bucket = storage_client.get_bucket(MASKED_BUCKET)

    for blob in source_bucket.list_blobs():
        item = {'value': blob.download_as_string()}

        response = dlp.deidentify_content(
            parent,
            inspect_config=inspect_config,
            deidentify_config=deidentify_config,
            item=item)

        masked_blob = Blob(blob.name, destination_bucket)
        masked_blob.upload_from_string(response.item.value)
def deidentify_with_fpe(project, string, info_types, alphabet=None,
                        surrogate_type=None, key_name=None, wrapped_key=None):
    """De-identify a string with Format Preserving Encryption (FPE).

    Args:
        project: Google Cloud project id used as the parent resource.
        string: The text to de-identify.
        info_types: Iterable of info type names to look for.
        alphabet: Value sent as ``common_alphabet``.
        surrogate_type: Optional surrogate info type name; only added to the
            request when truthy.
        key_name: Value sent as the KMS ``crypto_key_name``.
        wrapped_key: Base64-encoded wrapped key. It is decoded
            unconditionally, so it must be supplied despite the ``None``
            default.

    Returns:
        The ``item.value`` attribute of the API response.
    """
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()
    parent_path = client.project_path(project)

    # The key arrives base64-encoded; the request carries the raw bytes.
    import base64
    key_bytes = base64.b64decode(wrapped_key)

    fpe_settings = {
        'crypto_key': {
            'kms_wrapped': {
                'wrapped_key': key_bytes,
                'crypto_key_name': key_name,
            }
        },
        'common_alphabet': alphabet,
    }
    if surrogate_type:
        fpe_settings['surrogate_info_type'] = {'name': surrogate_type}

    inspection = {
        'info_types': [{'name': type_name} for type_name in info_types]
    }
    deidentification = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'crypto_replace_ffx_fpe_config': fpe_settings
                    }
                }
            ]
        }
    }

    result = client.deidentify_content(
        parent_path,
        inspect_config=inspection,
        deidentify_config=deidentification,
        item={'value': string})
    return result.item.value
def deidentify_with_fpe(project, string, info_types, alphabet=None,
                        surrogate_type=None, key_name=None, wrapped_key=None):
    """De-identify a string with Format Preserving Encryption and print it.

    Args:
        project: Google Cloud project id used as the parent resource.
        string: The text to de-identify.
        info_types: Iterable of info type names to look for.
        alphabet: Value sent as ``common_alphabet``. See
            https://cloud.google.com/dlp/docs/reference/rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        surrogate_type: Optional surrogate info type name; only added to the
            request when truthy.
        key_name: Name of the Cloud KMS key that wrapped the AES key, e.g.
            'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'.
        wrapped_key: Base64-encoded wrapped key. It is decoded
            unconditionally, so it must be supplied despite the ``None``
            default.

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    service = google.cloud.dlp.DlpServiceClient()
    parent_resource = service.project_path(project)

    # The key arrives base64-encoded; the request carries the raw bytes.
    import base64
    raw_key = base64.b64decode(wrapped_key)

    kms_key = {'wrapped_key': raw_key, 'crypto_key_name': key_name}
    fpe_options = {
        'crypto_key': {'kms_wrapped': kms_key},
        'common_alphabet': alphabet,
    }
    if surrogate_type:
        fpe_options['surrogate_info_type'] = {'name': surrogate_type}

    type_filters = [{'name': kind} for kind in info_types]
    transformation = {
        'primitive_transformation': {
            'crypto_replace_ffx_fpe_config': fpe_options
        }
    }

    reply = service.deidentify_content(
        parent_resource,
        inspect_config={'info_types': type_filters},
        deidentify_config={
            'info_type_transformations': {'transformations': [transformation]}
        },
        item={'value': string})

    print(reply.item.value)
def deidentify_with_date_shift(project, input_csv_file=None,
                               output_csv_file=None, date_fields=None,
                               lower_bound_days=None, upper_bound_days=None,
                               context_field_id=None, wrapped_key=None,
                               key_name=None):
    """De-identify dates in a CSV file by shifting them with the DLP API.

    The input file is read in full. Cells that parse as ``%m/%d/%Y`` are
    sent as ``date_value``, everything else as ``string_value``. The
    response table is written to ``output_csv_file``.

    Args:
        project: Google Cloud project id used as the parent resource.
        input_csv_file: Path of the CSV file to de-identify. The first row
            must contain the column names.
        output_csv_file: Path the date-shifted CSV is written to.
        date_fields: Names of the columns to date shift, e.g.
            ['birth_date', 'register_date'].
        lower_bound_days: Value sent as ``lower_bound_days``.
        upper_bound_days: Value sent as ``upper_bound_days``.
        context_field_id: (Optional) Column name sent as the date shift
            ``context``. Requires ``wrapped_key`` and ``key_name``.
        wrapped_key: (Optional) Base64-encoded wrapped AES key; decoded
            before being sent.
        key_name: (Optional) Name of the Cloud KMS key that wrapped the key,
            e.g. 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'.

    Returns:
        None. A status line is printed once the output file is written.

    Raises:
        ValueError: If only some of ``context_field_id``, ``key_name`` and
            ``wrapped_key`` are set.
    """
    import base64
    import csv
    from datetime import datetime

    # Validate the all-or-none key arguments first, so a bad call fails
    # before any file is read or any client is created.
    key_arguments = (context_field_id, key_name, wrapped_key)
    use_kms_key = all(key_arguments)
    if any(key_arguments) and not use_kms_key:
        raise ValueError(
            'You must set either ALL or NONE of '
            '[context_field_id, key_name, wrapped_key]!')

    import google.cloud.dlp

    dlp = google.cloud.dlp.DlpServiceClient()
    parent = dlp.project_path(project)

    def to_value(cell):
        """Convert one CSV cell into a DLP ``Value`` dictionary."""
        try:
            parsed = datetime.strptime(cell, '%m/%d/%Y')
        except ValueError:
            return {'string_value': cell}
        return {
            'date_value': {
                'year': parsed.year,
                'month': parsed.month,
                'day': parsed.day
            }
        }

    def from_value(value):
        """Convert one DLP response ``Value`` back into CSV cell text."""
        if value.string_value:
            return value.string_value
        date = value.date_value
        if not (date.year or date.month or date.day):
            # An empty string cell has no date either; do not emit "0/0/0".
            return ''
        return '%s/%s/%s' % (date.month, date.day, date.year)

    # newline='' is what the csv module requires for correct newline
    # handling on every platform.
    with open(input_csv_file, 'r', newline='') as csvfile:
        csv_lines = list(csv.reader(csvfile))

    # Concrete lists (not one-shot map objects) keep the request reusable.
    table_item = {
        'table': {
            'headers': [{'name': header} for header in csv_lines[0]],
            'rows': [
                {'values': [to_value(cell) for cell in line]}
                for line in csv_lines[1:]
            ]
        }
    }

    date_shift_config = {
        'lower_bound_days': lower_bound_days,
        'upper_bound_days': upper_bound_days
    }
    if use_kms_key:
        date_shift_config['context'] = {'name': context_field_id}
        date_shift_config['crypto_key'] = {
            'kms_wrapped': {
                # The key arrives base64-encoded; the API wants raw bytes.
                'wrapped_key': base64.b64decode(wrapped_key),
                'crypto_key_name': key_name
            }
        }

    deidentify_config = {
        'record_transformations': {
            'field_transformations': [{
                'fields': [{'name': field} for field in date_fields or []],
                'primitive_transformation': {
                    'date_shift_config': date_shift_config
                }
            }]
        }
    }

    response = dlp.deidentify_content(
        parent, deidentify_config=deidentify_config, item=table_item)

    with open(output_csv_file, 'w', newline='') as csvfile:
        write_file = csv.writer(csvfile, delimiter=',')
        write_file.writerow(
            [header.name for header in response.item.table.headers])
        for row in response.item.table.rows:
            write_file.writerow([from_value(value) for value in row.values])

    print('Successfully saved date-shift output to {}'.format(output_csv_file))
def deidentify_with_fpe(
    self, input_str, info_types, key_ring_id, key_id, name, alphabet,
    surrogate_type=None
):
    """De-identify a string with Format Preserving Encryption (FPE).

    Uses ``self.project_id`` as the parent project and ``self.wrapped_key``
    as the KMS-wrapped key, and sets ``self.export_wrap`` to 1.

    Args:
        input_str: The text to de-identify.
        info_types: Iterable of info type names to look for.
        key_ring_id: Key ring id used to build the KMS key name.
        key_id: Crypto key id used to build the KMS key name.
        name: Not used by this method.
        alphabet: Value sent as ``custom_alphabet``.
        surrogate_type: Optional surrogate info type name; only added to the
            request when truthy.

    Returns:
        The ``item.value`` attribute of the API response.
    """
    project_id = self.project_id
    # NOTE(review): location_id is read but never used; the key name below
    # is pinned to locations/global. Confirm whether that is intended.
    location = self.location_id
    kms_key_name = (
        f"projects/{project_id}/locations/global/"
        f"keyRings/{key_ring_id}/cryptoKeys/{key_id}"
    )

    # Presumably signals that the wrapped key should be exported; the flag
    # is only set here, confirm against its consumer.
    self.export_wrap = 1

    client = google.cloud.dlp_v2.DlpServiceClient()

    # self.wrapped_key is passed through as-is (no base64 decoding here).
    fpe_settings = {
        "crypto_key": {
            "kms_wrapped": {
                "wrapped_key": self.wrapped_key,
                "crypto_key_name": kms_key_name,
            }
        },
        "custom_alphabet": alphabet,
    }
    if surrogate_type:
        fpe_settings["surrogate_info_type"] = {"name": surrogate_type}

    request = {
        "parent": f"projects/{project_id}",
        "deidentify_config": {
            "info_type_transformations": {
                "transformations": [
                    {
                        "primitive_transformation": {
                            "crypto_replace_ffx_fpe_config": fpe_settings
                        }
                    }
                ]
            }
        },
        "inspect_config": {
            "info_types": [{"name": type_name} for type_name in info_types]
        },
        "item": {"value": input_str},
    }

    result = client.deidentify_content(request=request)
    return result.item.value
def deidentify_with_cdc(
    project,
    info_types,
    surrogate_type,
    key_name,
    wrapped_key,
    data_item,
    data_headers=None,
    alphabet=None,
):
    """De-identify a content item using ``crypto_deterministic_config``.

    Args:
        project: Google Cloud project id used as the parent resource.
        info_types: Iterable of info type names to look for.
        surrogate_type: Surrogate info type name; only added to the request
            when truthy.
        key_name: Name of the Cloud KMS key that wrapped the AES key, e.g.
            'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'.
        wrapped_key: Base64-encoded wrapped key; decoded before sending.
        data_item: Passed unchanged as the request ``item``. Presumably a
            ``{"table": ...}`` content item, since the table of the response
            is returned; confirm with callers.
        data_headers: Not used by this function.
        alphabet: Not used by this function.

    Returns:
        The ``item.table`` attribute of the API response.
    """
    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path(project)

    # The key arrives base64-encoded; the request carries the raw bytes.
    import base64
    key_bytes = base64.b64decode(wrapped_key)

    deterministic_settings = {
        "crypto_key": {
            "kms_wrapped": {
                "wrapped_key": key_bytes,
                "crypto_key_name": key_name,
            }
        }
    }
    if surrogate_type:
        deterministic_settings["surrogate_info_type"] = {
            "name": surrogate_type
        }

    inspection = {
        "info_types": [{"name": type_name} for type_name in info_types],
        "min_likelihood": "POSSIBLE",
    }
    deidentification = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "crypto_deterministic_config": deterministic_settings
                    }
                }
            ]
        }
    }

    result = client.deidentify_content(
        parent_path,
        inspect_config=inspection,
        deidentify_config=deidentification,
        item=data_item,
    )
    return result.item.table
def deidentify(text, action, project="roi-gcp-demos"):
    """De-identify ``text`` with the transformation selected by ``action``.

    Args:
        text: The text to de-identify.
        action: One of ``"replace"``, ``"redact"``, ``"character_mask"`` or
            its alias ``"mask"``.
        project: Google Cloud project id used as the parent resource.
            Defaults to the project this function was previously
            hard-wired to.

    Returns:
        A dict ``{"result": <text>}`` where the de-identified text has every
        newline replaced by ``<br>``.

    Raises:
        KeyError: If ``action`` does not name a supported transformation.
            Raised before any API client is created.
    """
    if action == "mask":
        action = "character_mask"
    config = action + "_config"

    # Explicit dispatch table instead of locals()[config]: only real
    # transformations can be selected, and an unknown action fails here
    # rather than after a client has been built.
    transformation_configs = {
        "replace_config": {"new_value": {"string_value": "[REDACTED]"}},
        "redact_config": {},
        "character_mask_config": {
            "masking_character": "#",
            "number_to_mask": len(text) - 4,
            "characters_to_ignore": [{
                "characters_to_skip": "(),-/,@,."
            }],
        },
    }
    primitive_transformation = {config: transformation_configs[config]}

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(project)

    inspect_config = {
        "info_types": [
            {"name": "EMAIL_ADDRESS"},
            {"name": "CREDIT_CARD_NUMBER"},
            {"name": "GENERIC_ID"},
            {"name": "IP_ADDRESS"},
            {"name": "PHONE_NUMBER"},
            {"name": "US_DRIVERS_LICENSE_NUMBER"},
            {"name": "US_SOCIAL_SECURITY_NUMBER"},
        ]
    }
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": primitive_transformation
            }]
        }
    }

    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item={"value": text},
    )

    # Newlines become <br> in the returned text.
    return {"result": response.item.value.replace("\n", "<br>")}
def deidentify_free_text_with_fpe_using_surrogate(
    project,
    input_str,
    alphabet="NUMERIC",
    info_type="PHONE_NUMBER",
    surrogate_type="PHONE_TOKEN",
    unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
):
    """De-identify free text with FPE using an unwrapped key and surrogate.

    Args:
        project: Google Cloud project id used as the parent resource.
        input_str: The text to de-identify.
        alphabet: Value sent as ``common_alphabet``. See
            https://cloud.google.com/dlp/docs/reference/rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        info_type: Name of the single info type to inspect for and
            transform.
        surrogate_type: Name sent as the surrogate info type.
        unwrapped_key: Base64-encoded key; decoded before being sent.

    Returns:
        None. The de-identified text (``response.item.value``) is printed.
    """
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path(project)

    # The key arrives base64-encoded; the request carries the raw bytes.
    import base64
    key_bytes = base64.b64decode(unwrapped_key)

    target_types = [{"name": info_type}]
    fpe_settings = {
        "crypto_key": {"unwrapped": {"key": key_bytes}},
        "common_alphabet": alphabet,
        "surrogate_info_type": {"name": surrogate_type},
    }
    deidentification = {
        "info_type_transformations": {
            "transformations": [
                {
                    "info_types": target_types,
                    "primitive_transformation": {
                        "crypto_replace_ffx_fpe_config": fpe_settings
                    },
                }
            ]
        }
    }

    # Inspect with a minimum likelihood of UNLIKELY.
    inspection = {
        "info_types": [{"name": info_type}],
        "min_likelihood": "UNLIKELY",
    }

    result = client.deidentify_content(
        parent_path,
        inspect_config=inspection,
        deidentify_config=deidentification,
        item={"value": input_str},
    )

    print(result.item.value)
def deidentify_with_date_shift(project, input_csv_file=None,
                               output_csv_file=None, date_fields=None,
                               lower_bound_days=None, upper_bound_days=None,
                               context_field_id=None, wrapped_key=None,
                               key_name=None):
    """De-identify dates in a CSV file by shifting them with the DLP API.

    The input file is read in full. Cells that parse as ``%m/%d/%Y`` are
    sent as ``date_value``, everything else as ``string_value``. The
    response table is written to ``output_csv_file``.

    Args:
        project: Google Cloud project id used as the parent resource.
        input_csv_file: Path of the CSV file to de-identify. The first row
            must contain the column names.
        output_csv_file: Path the date-shifted CSV is written to.
        date_fields: Names of the columns to date shift, e.g.
            ['birth_date', 'register_date'].
        lower_bound_days: Value sent as ``lower_bound_days``.
        upper_bound_days: Value sent as ``upper_bound_days``.
        context_field_id: (Optional) Column name sent as the date shift
            ``context``. Requires ``wrapped_key`` and ``key_name``.
        wrapped_key: (Optional) Base64-encoded wrapped AES key; decoded
            before being sent.
        key_name: (Optional) Name of the Cloud KMS key that wrapped the key,
            e.g. 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'.

    Returns:
        None. A status line is printed once the output file is written.

    Raises:
        ValueError: If only some of ``context_field_id``, ``key_name`` and
            ``wrapped_key`` are set.
    """
    import base64
    import csv
    from datetime import datetime

    # Check the all-or-none key arguments up front so that an invalid call
    # is rejected before touching the file system or the network.
    key_settings = (context_field_id, key_name, wrapped_key)
    with_kms_key = all(key_settings)
    if any(key_settings) and not with_kms_key:
        raise ValueError(
            'You must set either ALL or NONE of '
            '[context_field_id, key_name, wrapped_key]!')

    import google.cloud.dlp

    dlp = google.cloud.dlp.DlpServiceClient()
    parent = dlp.project_path(project)

    def cell_to_value(cell):
        """Convert one CSV cell into a DLP ``Value`` dictionary."""
        try:
            date = datetime.strptime(cell, '%m/%d/%Y')
        except ValueError:
            return {'string_value': cell}
        return {
            'date_value': {
                'year': date.year,
                'month': date.month,
                'day': date.day
            }
        }

    def value_to_cell(value):
        """Convert one DLP response ``Value`` back into CSV cell text."""
        if value.string_value:
            return value.string_value
        shifted = value.date_value
        if not (shifted.year or shifted.month or shifted.day):
            # Neither a string nor a date was set (empty input cell):
            # write an empty cell instead of the bogus "0/0/0".
            return ''
        return '%s/%s/%s' % (shifted.month, shifted.day, shifted.year)

    # The csv module requires newline='' for correct newline handling.
    with open(input_csv_file, 'r', newline='') as csvfile:
        records = list(csv.reader(csvfile))

    # Build real lists instead of lazy map objects, which can be consumed
    # only once.
    headers = [{'name': column} for column in records[0]]
    rows = [
        {'values': [cell_to_value(cell) for cell in record]}
        for record in records[1:]
    ]
    table_item = {
        'table': {
            'headers': headers,
            'rows': rows
        }
    }

    date_shift_config = {
        'lower_bound_days': lower_bound_days,
        'upper_bound_days': upper_bound_days
    }
    if with_kms_key:
        date_shift_config['context'] = {'name': context_field_id}
        date_shift_config['crypto_key'] = {
            'kms_wrapped': {
                # The key arrives base64-encoded; the API wants raw bytes.
                'wrapped_key': base64.b64decode(wrapped_key),
                'crypto_key_name': key_name
            }
        }

    deidentify_config = {
        'record_transformations': {
            'field_transformations': [
                {
                    'fields': [
                        {'name': field} for field in date_fields or []
                    ],
                    'primitive_transformation': {
                        'date_shift_config': date_shift_config
                    }
                }
            ]
        }
    }

    response = dlp.deidentify_content(
        parent, deidentify_config=deidentify_config, item=table_item)

    result_table = response.item.table
    with open(output_csv_file, 'w', newline='') as csvfile:
        write_file = csv.writer(csvfile, delimiter=',')
        write_file.writerow([header.name for header in result_table.headers])
        for row in result_table.rows:
            write_file.writerow([value_to_cell(v) for v in row.values])

    print('Successfully saved date-shift output to {}'.format(
        output_csv_file))
def deidentify_with_deterministic(
    project,
    input_str,
    info_types,
    surrogate_type=None,
    key_name=None,
    wrapped_key=None,
):
    """Deidentify sensitive data in a string using deterministic encryption.

    Sends ``input_str`` to the Data Loss Prevention API's
    ``deidentify_content`` method with a ``crypto_deterministic_config``
    transformation applied to every finding of the requested info types.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        surrogate_type: The name of the surrogate custom info type to use.
            Only necessary if you want to reverse the deidentification
            process. Can be essentially any arbitrary string, as long as it
            doesn't appear in your dataset otherwise.
        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
            AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: The encrypted ('wrapped') AES-256 key to use, as a
            base64-encoded string. This key should be encrypted using the
            Cloud KMS key specified by key_name. Required despite the
            ``None`` default.

    Returns:
        The deidentified text, i.e. ``response.item.value`` from the API
        response.

    Raises:
        TypeError: If ``wrapped_key`` is None.
    """
    # Fail fast with a readable message; otherwise base64.b64decode(None)
    # raises an opaque TypeError only after the client has been created.
    if wrapped_key is None:
        raise TypeError(
            "wrapped_key is required: pass the base64-encoded wrapped "
            "AES-256 key"
        )

    import base64

    # Import the module that is actually referenced below instead of relying
    # on google.cloud.dlp pulling it in as a side effect.
    import google.cloud.dlp_v2

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    wrapped_key = base64.b64decode(wrapped_key)

    crypto_replace_deterministic_config = {
        "crypto_key": {
            "kms_wrapped": {
                "wrapped_key": wrapped_key,
                "crypto_key_name": key_name,
            }
        },
    }

    # The surrogate type is the prefix in the pseudonymized string.
    if surrogate_type:
        crypto_replace_deterministic_config["surrogate_info_type"] = {
            "name": surrogate_type
        }

    inspect_config = {
        "info_types": [{"name": info_type} for info_type in info_types]
    }

    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "crypto_deterministic_config":
                            crypto_replace_deterministic_config
                    }
                }
            ]
        }
    }

    item = {"value": input_str}

    # Call the DLP API https://cloud.google.com/dlp/docs/pseudonymization
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": item,
        }
    )

    return response.item.value
def redact_text(data, project):
    """Replace sensitive content in a transcript payload using the DLP API.

    When ``data['dlp']`` is the string ``'true'`` or ``'True'``, the
    transcript, every word, every entity name and every sentence in ``data``
    are each sent to ``deidentify_content`` and overwritten with the value
    the API returns (findings are replaced by ``'#'``). For any other flag
    value the payload is returned untouched and no client is created.

    Args:
        data: A dict with the keys ``'dlp'``, ``'transcript'``, ``'words'``
            (iterable of dicts with a ``'word'`` key), ``'entities'``
            (iterable of dicts with a ``'name'`` key) and ``'sentences'``
            (iterable of dicts with a ``'sentence'`` key).
        project: The Google Cloud project id to use as a parent resource.

    Returns:
        The same ``data`` object, modified in place when redaction is
        enabled.
    """
    # Guard clause: nothing to do unless redaction was requested.
    if data['dlp'] not in ('true', 'True'):
        return data

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(project)

    # This will use all info types available, you can narrow it to a list
    # or template.
    available = dlp.list_info_types('en-US')
    inspect_config = {
        "info_types": [
            {'name': info_type.name} for info_type in available.info_types
        ]
    }

    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_config": {
                        "new_value": {
                            "string_value": '#',
                        }
                    }
                }
            }]
        }
    }

    def _redact(value):
        # Run a single string through deidentify_content and return the
        # transformed text.
        response = dlp.deidentify_content(
            parent,
            inspect_config=inspect_config,
            deidentify_config=deidentify_config,
            item={"value": value},
        )
        return response.item.value

    # Order matches the original: transcript, words, entities, sentences.
    data['transcript'] = _redact(data['transcript'])

    for words_element in data['words']:
        words_element['word'] = _redact(words_element['word'])

    for entities_element in data['entities']:
        entities_element['name'] = _redact(entities_element['name'])

    for sentences_element in data['sentences']:
        sentences_element['sentence'] = _redact(sentences_element['sentence'])

    return data
def deidentify_with_fpe(project, string, info_types, alphabet=None,
                        surrogate_type=None, key_name=None, wrapped_key=None):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string using Format Preserving Encryption (FPE).

    Args:
        project: The Google Cloud project id to use as a parent resource.
        string: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        alphabet: The set of characters to replace sensitive ones with. For
            more information, see https://cloud.google.com/dlp/docs/reference/
            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        surrogate_type: The name of the surrogate custom info type to use.
            Only necessary if you want to reverse the deidentification
            process. Can be essentially any arbitrary string, as long as it
            doesn't appear in your dataset otherwise.
        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
            AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: The encrypted ('wrapped') AES-256 key to use, as a
            base64-encoded string. This key should be encrypted using the
            Cloud KMS key specified by key_name. Required despite the
            ``None`` default.

    Returns:
        None; the deidentified text from the API response is printed to the
        terminal.

    Raises:
        TypeError: If ``wrapped_key`` is None.
    """
    # Fail fast with a readable message; otherwise base64.b64decode(None)
    # raises an opaque TypeError only after the client has been created.
    if wrapped_key is None:
        raise TypeError(
            "wrapped_key is required: pass the base64-encoded wrapped "
            "AES-256 key"
        )

    import base64

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    wrapped_key = base64.b64decode(wrapped_key)

    # Construct FPE configuration dictionary
    crypto_replace_ffx_fpe_config = {
        'crypto_key': {
            'kms_wrapped': {
                'wrapped_key': wrapped_key,
                'crypto_key_name': key_name
            }
        },
        'common_alphabet': alphabet
    }

    # Add surrogate type
    if surrogate_type:
        crypto_replace_ffx_fpe_config['surrogate_info_type'] = {
            'name': surrogate_type
        }

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'crypto_replace_ffx_fpe_config':
                            crypto_replace_ffx_fpe_config
                    }
                }
            ]
        }
    }

    # Convert string to item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    # Print results
    print(response.item.value)
def redact_DocumentTypes(project, filename, output_filename, mime_type=None):
    """Deidentify the contents of a file with the Data Loss Prevention API.

    The file is read as raw bytes and sent to ``deidentify_content`` with the
    ``ALL_BASIC`` info type; every finding is replaced by the literal string
    ``[XXXXXXXXXXXX]``. The bytes returned by the API are written to
    ``output_filename``.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the deidentified bytes will be
            written.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
            MIME types missing from the supported table are sent with
            content type 0 ("Unspecified").

    Returns:
        None; the bytes returned by the API are printed to the terminal and
        written to ``output_filename``.
    """
    # NOTE(review): this overrides any GOOGLE_APPLICATION_CREDENTIALS already
    # configured in the environment with a hard-coded, relative key-file
    # path. Kept for backward compatibility; confirm whether callers rely on
    # it before removing.
    os.environ[
        "GOOGLE_APPLICATION_CREDENTIALS"] = "kubernetes-e9dc8af4883c.json"

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or 'application/octet-stream'

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # mimetypes reports ".svg" as "image/svg+xml", so accept that
        # spelling too; otherwise guessed SVG files fall back to 0.
        'image/svg+xml': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data. The raw input
    # is deliberately not echoed to stdout: it still holds the sensitive
    # data this function exists to remove.
    with open(filename, mode='rb') as f:
        byte_item = {'type': content_type_index, 'data': f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Replace every finding with a fixed placeholder string.
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_config": {
                        "new_value": {
                            "string_value": "[XXXXXXXXXXXX]"
                        }
                    }
                }
            }]
        }
    }
    inspect_config = {"info_types": [{"name": "ALL_BASIC"}]}
    item = {"byte_item": byte_item}

    # Call the API.
    response = dlp.deidentify_content(parent, inspect_config=inspect_config,
                                      deidentify_config=deidentify_config,
                                      item=item)
    print(response.item.byte_item.data)

    # Write out the results.
    with open(output_filename, mode='wb') as f:
        f.write(response.item.byte_item.data)
    print('Written')