def deidentify_with_redact(self, input_str, info_types):

        project = self.project_id

        # Instantiate a client
        dlp = google.cloud.dlp_v2.DlpServiceClient()
        # Convert the project id into a full resource id.
        parent = f"projects/{project}"
        # Construct inspect configuration dictionary
        inspect_config = {"info_types": [
            {"name": info_type} for info_type in info_types]}

        # Construct deidentify configuration dictionary
        deidentify_config = {
            "info_type_transformations": {
                "transformations": [{"primitive_transformation": {"redact_config": {}}}]
            }
        }
        # Construct item
        item = {"value": input_str}

        # Call the API
        response = dlp.deidentify_content(
            request={
                "parent": parent,
                "deidentify_config": deidentify_config,
                "inspect_config": inspect_config,
                "item": item,
            }
        )

        # Print out the results.
        return response.item.value
 def process(self, elem):
     import google.cloud.dlp
     dlp = google.cloud.dlp_v2.DlpServiceClient()
     data = {"header": ["EMAIL"]}
     headers = [{"name": val} for val in data["header"]]
     rows = []
     for event in elem:
         rows.append({"values": [{"string_value": str(event.email)}]})
     table = {}
     table["headers"] = headers
     table["rows"] = rows
     item = {"table": table}
     parent = dlp.project_path(self.project_id)
     deidentify_template_name = "projects/{}/deidentifyTemplates/{}".format(
         self.project_id, self.template_id.get())
     response = dlp.deidentify_content(
         parent,
         deidentify_template_name=deidentify_template_name,
         item=item)
     i = 0
     for event in elem:
         user = User(event.full_name,
                     response.item.table.rows[i].values[0].string_value,
                     event.job, event.city)
         i = i + 1
         yield user.obj_to_Row()
Exemple #3
0
def deidentify_with_replace(
    project,
    input_str,
    info_types,
    replacement_str="REPLACEMENT_STR",
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing matched input values with a value you specify.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        replacement_str: The string to replace all values that match given
            info types.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{
            "name": info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_config": {
                        "new_value": {
                            "string_value": replacement_str,
                        }
                    }
                }
            }]
        }
    }

    # Construct item
    item = {"value": input_str}

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item,
    )

    # Print out the results.
    print(response.item.value)
Exemple #4
0
def deidentify_with_mask(
    project, input_str, info_types, masking_character=None, number_to_mask=0
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        masking_character: The character to mask matching sensitive data with.
        number_to_mask: The maximum number of sensitive characters to mask in
            a match. If omitted or set to zero, the API will default to no
            maximum.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Construct inspect configuration dictionary
    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "character_mask_config": {
                            "masking_character": masking_character,
                            "number_to_mask": number_to_mask,
                        }
                    }
                }
            ]
        }
    }

    # Construct item
    item = {"value": input_str}

    # Call the API
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": item,
        }
    )

    # Print out the results.
    print(response.item.value)
Exemple #5
0
def deidentify_with_mask(project,
                         string,
                         info_types,
                         masking_character=None,
                         number_to_mask=0):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        masking_character: The character to mask matching sensitive data with.
        number_to_mask: The maximum number of sensitive characters to mask in
            a match. If omitted or set to zero, the API will default to no
            maximum.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{
            'name': info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [{
                'primitive_transformation': {
                    'character_mask_config': {
                        'masking_character': masking_character,
                        'number_to_mask': number_to_mask
                    }
                }
            }]
        }
    }

    # Construct item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(parent,
                                      inspect_config=inspect_config,
                                      deidentify_config=deidentify_config,
                                      item=item)

    # Print out the results.
    print(response.item.value)
def deidentify_with_mask(project, string, info_types, masking_character=None,
                         number_to_mask=0):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        masking_character: The character to mask matching sensitive data with.
        number_to_mask: The maximum number of sensitive characters to mask in
            a match. If omitted or set to zero, the API will default to no
            maximum.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'character_mask_config': {
                            'masking_character': masking_character,
                            'number_to_mask': number_to_mask
                        }
                    }
                }
            ]
        }
    }

    # Construct item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    # Print out the results.
    print(response.item.value)
Exemple #7
0
def deidentify_with_redact(
    project,
    input_str,
    info_types,
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by redacting matched input values.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{
            "name": info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "redact_config": {}
                }
            }]
        }
    }

    # Construct item
    item = {"value": input_str}

    # Call the API
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": item,
        })

    # Print out the results.
    print(response.item.value)
 def deindentify(self, data, project_id, template_id):
     headers = [{"name": val} for val in data["header"]]
     rows = []
     for row in data["rows"]:
         rows.append( {"values": [{"string_value": cell_val} for cell_val in row]} )
     table = {}
     table["headers"] = headers
     table["rows"] = rows
     item = {"table": table}
     dlp = google.cloud.dlp_v2.DlpServiceClient()
     parent = dlp.project_path(project_id)
     deidentify_template=f"projects/{project_id}/deidentifyTemplates/{template_id}"
     response = dlp.deidentify_content(parent, deidentify_template_name=deidentify_template,item=item)
     return response
Exemple #9
0
def deidentify_with_replace_infotype(project, item, info_types):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing it with the info type.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{
            "name": info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_with_info_type_config": {}
                }
            }]
        }
    }

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item={"value": item},
    )

    # Print out the results.
    print(response.item.value)
Exemple #10
0
def deidentify_with_mask(project, string, info_types, masking_character=None,
                         number_to_mask=0):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'character_mask_config': {
                            'masking_character': masking_character,
                            'number_to_mask': number_to_mask
                        }
                    }
                }
            ]
        }
    }

    # Construct item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    return response.item.value
Exemple #11
0
def deidentify_with_mask(data,done):

    # Convert the project id into a full resource id.
    parent = dlp.project_path(PROJECT_ID)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in INFO_TYPES]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'character_mask_config': {
                            'masking_character': 'X',
                            'number_to_mask': 0
                        }
                    }
                }
            ]
        }
    }


    storage_client = storage.Client()
    bucket = storage_client.get_bucket(SENSITIVE_BUCKET)

    blobs = bucket.list_blobs()

    for blob in blobs:
        gcs_file = blob.download_as_string()
        #contents = gcs_file.readline()
        item = {'value': gcs_file}
         # Call the API
        response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)    
        masked_item = response.item.value
        destination_bucket = storage_client.get_bucket(MASKED_BUCKET)
        masked_blob = Blob(blob.name,destination_bucket)
        masked_blob.upload_from_string(masked_item)
Exemple #12
0
def deidentify_with_fpe(project, string, info_types, alphabet=None,
                surrogate_type=None, key_name=None, wrapped_key=None):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string using Format Preserving Encryption (FPE).
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    import base64
    wrapped_key = base64.b64decode(wrapped_key)

    # Construct FPE configuration dictionary
    crypto_replace_ffx_fpe_config = {
        'crypto_key': {
            'kms_wrapped': {
                'wrapped_key': wrapped_key,
                'crypto_key_name': key_name
            }
        },
        'common_alphabet': alphabet
    }

    # Add surrogate type
    if surrogate_type:
        crypto_replace_ffx_fpe_config['surrogate_info_type'] = {
            'name': surrogate_type
        }

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'crypto_replace_ffx_fpe_config':
                            crypto_replace_ffx_fpe_config
                    }
                }
            ]
        }
    }

    # Convert string to item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    return response.item.value
Exemple #13
0
def deidentify_with_fpe(project,
                        string,
                        info_types,
                        alphabet=None,
                        surrogate_type=None,
                        key_name=None,
                        wrapped_key=None):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string using Format Preserving Encryption (FPE).
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        alphabet: The set of characters to replace sensitive ones with. For
            more information, see https://cloud.google.com/dlp/docs/reference/
            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        surrogate_type: The name of the surrogate custom info type to use. Only
            necessary if you want to reverse the deidentification process. Can
            be essentially any arbitrary string, as long as it doesn't appear
            in your dataset otherwise.
        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
            AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
            should be encrypted using the Cloud KMS key specified by key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    import base64
    wrapped_key = base64.b64decode(wrapped_key)

    # Construct FPE configuration dictionary
    crypto_replace_ffx_fpe_config = {
        'crypto_key': {
            'kms_wrapped': {
                'wrapped_key': wrapped_key,
                'crypto_key_name': key_name
            }
        },
        'common_alphabet': alphabet
    }

    # Add surrogate type
    if surrogate_type:
        crypto_replace_ffx_fpe_config['surrogate_info_type'] = {
            'name': surrogate_type
        }

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{
            'name': info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [{
                'primitive_transformation': {
                    'crypto_replace_ffx_fpe_config':
                    crypto_replace_ffx_fpe_config
                }
            }]
        }
    }

    # Convert string to item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(parent,
                                      inspect_config=inspect_config,
                                      deidentify_config=deidentify_config,
                                      item=item)

    # Print results
    print(response.item.value)
Exemple #14
0
def deidentify_with_date_shift(project,
                               input_csv_file=None,
                               output_csv_file=None,
                               date_fields=None,
                               lower_bound_days=None,
                               upper_bound_days=None,
                               context_field_id=None,
                               wrapped_key=None,
                               key_name=None):
    """Uses the Data Loss Prevention API to deidentify dates in a CSV file by
        pseudorandomly shifting them.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_csv_file: The path to the CSV file to deidentify. The first row
            of the file must specify column names, and all other rows must
            contain valid values.
        output_csv_file: The path to save the date-shifted CSV file.
        date_fields: The list of (date) fields in the CSV file to date shift.
            Example: ['birth_date', 'register_date']
        lower_bound_days: The maximum number of days to shift a date backward
        upper_bound_days: The maximum number of days to shift a date forward
        context_field_id: (Optional) The column to determine date shift amount
            based on. If this is not specified, a random shift amount will be
            used for every row. If this is specified, then 'wrappedKey' and
            'keyName' must also be set. Example:
            contextFieldId = [{ 'name': 'user_id' }]
        key_name: (Optional) The name of the Cloud KMS key used to encrypt
            ('wrap') the AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use.
            This key should be encrypted using the Cloud KMS key specified by
            key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Convert date field list to Protobuf type
    def map_fields(field):
        return {'name': field}

    if date_fields:
        date_fields = map(map_fields, date_fields)
    else:
        date_fields = []

    # Read and parse the CSV file
    import csv
    from datetime import datetime
    f = []
    with open(input_csv_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            f.append(row)

    #  Helper function for converting CSV rows to Protobuf types
    def map_headers(header):
        return {'name': header}

    def map_data(value):
        try:
            date = datetime.strptime(value, '%m/%d/%Y')
            return {
                'date_value': {
                    'year': date.year,
                    'month': date.month,
                    'day': date.day
                }
            }
        except ValueError:
            return {'string_value': value}

    def map_rows(row):
        return {'values': map(map_data, row)}

    # Using the helper functions, convert CSV rows to protobuf-compatible
    # dictionaries.
    csv_headers = map(map_headers, f[0])
    csv_rows = map(map_rows, f[1:])

    # Construct the table dict
    table_item = {'table': {'headers': csv_headers, 'rows': csv_rows}}

    # Construct date shift config
    date_shift_config = {
        'lower_bound_days': lower_bound_days,
        'upper_bound_days': upper_bound_days
    }

    # If using a Cloud KMS key, add it to the date_shift_config.
    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    if context_field_id and key_name and wrapped_key:
        import base64
        date_shift_config['context'] = {'name': context_field_id}
        date_shift_config['crypto_key'] = {
            'kms_wrapped': {
                'wrapped_key': base64.b64decode(wrapped_key),
                'crypto_key_name': key_name
            }
        }
    elif context_field_id or key_name or wrapped_key:
        raise ValueError("""You must set either ALL or NONE of
        [context_field_id, key_name, wrapped_key]!""")

    # Construct Deidentify Config
    deidentify_config = {
        'record_transformations': {
            'field_transformations': [{
                'fields': date_fields,
                'primitive_transformation': {
                    'date_shift_config': date_shift_config
                }
            }]
        }
    }

    # Write to CSV helper methods
    def write_header(header):
        return header.name

    def write_data(data):
        return data.string_value or '%s/%s/%s' % (
            data.date_value.month, data.date_value.day, data.date_value.year)

    # Call the API
    response = dlp.deidentify_content(parent,
                                      deidentify_config=deidentify_config,
                                      item=table_item)

    # Write results to CSV file
    with open(output_csv_file, 'w') as csvfile:
        write_file = csv.writer(csvfile, delimiter=',')
        write_file.writerow(map(write_header, response.item.table.headers))
        for row in response.item.table.rows:
            write_file.writerow(map(write_data, row.values))
    # Print status
    print('Successfully saved date-shift output to {}'.format(output_csv_file))
    def deidentify_with_fpe(
        self,
        input_str,
        info_types,
        key_ring_id,
        key_id,
        name,
        alphabet,
        surrogate_type=None
    ):
        project = self.project_id
        location = self.location_id
        key_name = f"projects/{project}/locations/global/keyRings/{key_ring_id}/cryptoKeys/{key_id}"

        # Storing the wrapped key inside json file
        self.export_wrap = 1

        # Instantiate a client
        dlp = google.cloud.dlp_v2.DlpServiceClient()

        # Convert the project id into a full resource id.
        parent = f"projects/{project}"

        # wrapped_key = base64.b64decode(wrapped_key)

        # Construct FPE configuration dictionary
        crypto_replace_ffx_fpe_config = {
            "crypto_key": {
                "kms_wrapped": {"wrapped_key": self.wrapped_key, "crypto_key_name": key_name}
            },
            "custom_alphabet": alphabet,
        }

        # Add surrogate type
        if surrogate_type:
            crypto_replace_ffx_fpe_config["surrogate_info_type"] = {
                "name": surrogate_type}

        # Construct inspect configuration dictionary
        inspect_config = {"info_types": [
            {"name": info_type} for info_type in info_types]}

        # Construct deidentify configuration dictionary
        deidentify_config = {
            "info_type_transformations": {
                "transformations": [
                    {
                        "primitive_transformation": {
                            "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config
                        }
                    }
                ]
            }
        }

        # Convert string to item
        item = {"value": input_str}

        # Call the API
        response = dlp.deidentify_content(
            request={
                "parent": parent,
                "deidentify_config": deidentify_config,
                "inspect_config": inspect_config,
                "item": item,

            }
        )

        # Print results
        return response.item.value
Exemple #16
0
def deidentify_with_cdc(
    project,
    info_types,
    surrogate_type,
    key_name,
    wrapped_key,
    data_item,
    data_headers=None,
    alphabet=None,

):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string using Format Preserving Encryption (FPE).
    Args:
        project: The Google Cloud project id to use as a parent resource in which DLP API is enabled.
        data_item: The string to deidentify (will be treated as text).
        alphabet: The set of characters to replace sensitive ones with. For
            more information, see https://cloud.google.com/dlp/docs/reference/
            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        surrogate_type: The name of the surrogate custom info type to use. Only
            necessary if you want to reverse the deidentification process. Can
            be essentially any arbitrary string, as long as it doesn't appear
            in your dataset otherwise.
        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
            AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
            should be encrypted using the Cloud KMS key specified by key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    
    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    import base64

    wrapped_key = base64.b64decode(wrapped_key)

    # Construct CyrptoDeterministicConfig configuration dictionary
    crypto_deterministic_config = {
        "crypto_key": {
            "kms_wrapped": {
                "wrapped_key": wrapped_key,
                "crypto_key_name": key_name,
            }
        }
    }

    # Add surrogate type
    if surrogate_type:
        crypto_deterministic_config["surrogate_info_type"] = {
            "name": surrogate_type
        }

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{"name": info_type} for info_type in info_types],
        "min_likelihood": "POSSIBLE"
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "crypto_deterministic_config": crypto_deterministic_config
                    }
                }
            ]
        }
    }

    # Construct the table dict
    #table_item = {
    #    "table": {
    #        "headers": [{"name": header} for header in data_headers], 
    #        "rows": [{"values": [{"string_value": key}, {"string_value": value}]} for key, value in data_items.items()]
    #    }
    #}

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=data_item,
    )

    return response.item.table
Exemple #17
0
def deidentify(text, action):

    if action == "mask":
        action = "character_mask"

    config = action + "_config"

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path('roi-gcp-demos')

    inspect_config = {
        "info_types": [
            {
                "name": "EMAIL_ADDRESS"
            },
            {
                "name": "CREDIT_CARD_NUMBER"
            },
            {
                "name": "GENERIC_ID"
            },
            {
                "name": "IP_ADDRESS"
            },
            {
                "name": "PHONE_NUMBER"
            },
            {
                "name": "US_DRIVERS_LICENSE_NUMBER"
            },
            {
                "name": "US_SOCIAL_SECURITY_NUMBER"
            },
        ]
    }

    replace_config = {"new_value": {"string_value": "[REDACTED]"}}

    redact_config = {}

    character_mask_config = {
        "masking_character": "#",
        "number_to_mask": len(text) - 4,
        "characters_to_ignore": [{
            "characters_to_skip": "(),-/,@,."
        }]
    }

    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    config: locals()[config]
                }
            }]
        }
    }

    item = {"value": text}

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item,
    )

    # Print out the results.
    return {"result": "<br>".join(response.item.value.split("\n"))}
Exemple #18
0
def deidentify_free_text_with_fpe_using_surrogate(
    project,
    input_str,
    alphabet="NUMERIC",
    info_type="PHONE_NUMBER",
    surrogate_type="PHONE_TOKEN",
    unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
       string using Format Preserving Encryption (FPE).
       The encryption is performed with an unwrapped key.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        alphabet: The set of characters to replace sensitive ones with. For
            more information, see https://cloud.google.com/dlp/docs/reference/
            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        info_type: The name of the info type to de-identify
        surrogate_type: The name of the surrogate custom info type to use. Can
            be essentially any arbitrary string, as long as it doesn't appear
            in your dataset otherwise.
        unwrapped_key: The base64-encoded AES-256 key to use.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # The unwrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    import base64

    unwrapped_key = base64.b64decode(unwrapped_key)

    # Construct de-identify config
    transformation = {
        "info_types": [{
            "name": info_type
        }],
        "primitive_transformation": {
            "crypto_replace_ffx_fpe_config": {
                "crypto_key": {
                    "unwrapped": {
                        "key": unwrapped_key
                    }
                },
                "common_alphabet": alphabet,
                "surrogate_info_type": {
                    "name": surrogate_type
                },
            }
        }
    }

    deidentify_config = {
        "info_type_transformations": {
            "transformations": [transformation]
        }
    }

    # Construct the inspect config, trying to finding all PII with likelihood
    # higher than UNLIKELY
    inspect_config = {
        "info_types": [{
            "name": info_type
        }],
        "min_likelihood": "UNLIKELY"
    }

    # Convert string to item
    item = {"value": input_str}

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item,
    )

    # Print results
    print(response.item.value)
def deidentify_with_date_shift(project, input_csv_file=None,
                               output_csv_file=None, date_fields=None,
                               lower_bound_days=None, upper_bound_days=None,
                               context_field_id=None, wrapped_key=None,
                               key_name=None):
    """Uses the Data Loss Prevention API to deidentify dates in a CSV file by
        pseudorandomly shifting them.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_csv_file: The path to the CSV file to deidentify. The first row
            of the file must specify column names, and all other rows must
            contain valid values.
        output_csv_file: The path to save the date-shifted CSV file.
        date_fields: The list of (date) fields in the CSV file to date shift.
            Example: ['birth_date', 'register_date']
        lower_bound_days: The maximum number of days to shift a date backward
        upper_bound_days: The maximum number of days to shift a date forward
        context_field_id: (Optional) The column to determine date shift amount
            based on. If this is not specified, a random shift amount will be
            used for every row. If this is specified, then 'wrappedKey' and
            'keyName' must also be set. Example:
            contextFieldId = [{ 'name': 'user_id' }]
        key_name: (Optional) The name of the Cloud KMS key used to encrypt
            ('wrap') the AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use.
            This key should be encrypted using the Cloud KMS key specified by
            key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Convert date field list to Protobuf type
    def map_fields(field):
        return {'name': field}

    if date_fields:
        date_fields = map(map_fields, date_fields)
    else:
        date_fields = []

    # Read and parse the CSV file
    import csv
    from datetime import datetime
    f = []
    with open(input_csv_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            f.append(row)

    #  Helper function for converting CSV rows to Protobuf types
    def map_headers(header):
        return {'name': header}

    def map_data(value):
        try:
            date = datetime.strptime(value, '%m/%d/%Y')
            return {
                'date_value': {
                    'year': date.year,
                    'month': date.month,
                    'day': date.day
                }
            }
        except ValueError:
            return {'string_value': value}

    def map_rows(row):
        return {'values': map(map_data, row)}

    # Using the helper functions, convert CSV rows to protobuf-compatible
    # dictionaries.
    csv_headers = map(map_headers, f[0])
    csv_rows = map(map_rows, f[1:])

    # Construct the table dict
    table_item = {
        'table': {
            'headers': csv_headers,
            'rows': csv_rows
        }
    }

    # Construct date shift config
    date_shift_config = {
        'lower_bound_days': lower_bound_days,
        'upper_bound_days': upper_bound_days
    }

    # If using a Cloud KMS key, add it to the date_shift_config.
    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    if context_field_id and key_name and wrapped_key:
        import base64
        date_shift_config['context'] = {'name': context_field_id}
        date_shift_config['crypto_key'] = {
            'kms_wrapped': {
                'wrapped_key': base64.b64decode(wrapped_key),
                'crypto_key_name': key_name
            }
        }
    elif context_field_id or key_name or wrapped_key:
        raise ValueError("""You must set either ALL or NONE of
        [context_field_id, key_name, wrapped_key]!""")

    # Construct Deidentify Config
    deidentify_config = {
        'record_transformations': {
            'field_transformations': [
                {
                    'fields': date_fields,
                    'primitive_transformation': {
                        'date_shift_config': date_shift_config
                    }
                }
            ]
        }
    }

    # Write to CSV helper methods
    def write_header(header):
        return header.name

    def write_data(data):
        return data.string_value or '%s/%s/%s' % (data.date_value.month,
                                                  data.date_value.day,
                                                  data.date_value.year)

    # Call the API
    response = dlp.deidentify_content(
        parent, deidentify_config=deidentify_config, item=table_item)

    # Write results to CSV file
    with open(output_csv_file, 'w') as csvfile:
        write_file = csv.writer(csvfile, delimiter=',')
        write_file.writerow(map(write_header, response.item.table.headers))
        for row in response.item.table.rows:
            write_file.writerow(map(write_data, row.values))
    # Print status
    print('Successfully saved date-shift output to {}'.format(
                output_csv_file))
Exemple #20
0
def deidentify_with_deterministic(
    project,
    input_str,
    info_types,
    surrogate_type=None,
    key_name=None,
    wrapped_key=None,
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string using Deterministic encryption
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        surrogate_type: The name of the surrogate custom info type to use. Only
            necessary if you want to reverse the deidentification process. Can
            be essentially any arbitrary string, as long as it doesn't appear
            in your dataset otherwise.
        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
            AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
            should be encrypted using the Cloud KMS key specified by key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    import google.cloud.dlp
    dlp = google.cloud.dlp_v2.DlpServiceClient()
    # Convert the project id into a full resource id.
    parent = f"projects/{project}"
    # The wrapped key is base64-encoded, but the library expects a binary string, so decode it here.
    import base64
    wrapped_key = base64.b64decode(wrapped_key)

    crypto_replace_deterministic_config = {
        "crypto_key": {
            "kms_wrapped": {
                "wrapped_key": wrapped_key,
                "crypto_key_name": key_name
            }
        },
    }

    # surrogate type is the prefix in the Pseudonymized string
    if surrogate_type:
        crypto_replace_deterministic_config["surrogate_info_type"] = {
            "name": surrogate_type
        }

    inspect_config = {
        "info_types": [{
            "name": info_type
        } for info_type in info_types]
    }
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "crypto_deterministic_config":
                    crypto_replace_deterministic_config
                }
            }]
        }
    }

    item = {"value": input_str}
    # Call the DLP API https://cloud.google.com/dlp/docs/pseudonymization
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": item,
        })

    return response.item.value
def redact_text(data, project):
    info_types = []
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_config": {
                        "new_value": {
                            "string_value": '#',
                        }
                    }
                }
            }]
        }
    }

    if data['dlp'] == 'true' or data['dlp'] == 'True':
        dlp = google.cloud.dlp_v2.DlpServiceClient()
        parent = dlp.project_path(project)
        response = dlp.list_info_types('en-US')

        # This will use all info types available, you can narrow it to a list or template
        for info_type in response.info_types:
            info_types.append({'name': info_type.name})

        inspect_config = {"info_types": info_types}

        item = {"value": data['transcript']}
        response = dlp.deidentify_content(
            parent,
            inspect_config=inspect_config,
            deidentify_config=deidentify_config,
            item=item,
        )
        data['transcript'] = response.item.value

        for words_element in data['words']:
            item = {"value": words_element['word']}
            response = dlp.deidentify_content(
                parent,
                inspect_config=inspect_config,
                deidentify_config=deidentify_config,
                item=item,
            )
            words_element['word'] = response.item.value

        for entities_element in data['entities']:
            item = {"value": entities_element['name']}
            response = dlp.deidentify_content(
                parent,
                inspect_config=inspect_config,
                deidentify_config=deidentify_config,
                item=item,
            )
            entities_element['name'] = response.item.value

        for sentences_element in data['sentences']:
            item = {"value": sentences_element['sentence']}
            response = dlp.deidentify_content(
                parent,
                inspect_config=inspect_config,
                deidentify_config=deidentify_config,
                item=item,
            )
            sentences_element['sentence'] = response.item.value

    return data
def deidentify_with_fpe(project, string, info_types, alphabet=None,
                        surrogate_type=None, key_name=None, wrapped_key=None):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string using Format Preserving Encryption (FPE).
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        alphabet: The set of characters to replace sensitive ones with. For
            more information, see https://cloud.google.com/dlp/docs/reference/
            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        surrogate_type: The name of the surrogate custom info type to use. Only
            necessary if you want to reverse the deidentification process. Can
            be essentially any arbitrary string, as long as it doesn't appear
            in your dataset otherwise.
        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
            AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
            should be encrypted using the Cloud KMS key specified by key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    import base64
    wrapped_key = base64.b64decode(wrapped_key)

    # Construct FPE configuration dictionary
    crypto_replace_ffx_fpe_config = {
        'crypto_key': {
            'kms_wrapped': {
                'wrapped_key': wrapped_key,
                'crypto_key_name': key_name
            }
        },
        'common_alphabet': alphabet
    }

    # Add surrogate type
    if surrogate_type:
        crypto_replace_ffx_fpe_config['surrogate_info_type'] = {
            'name': surrogate_type
        }

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'crypto_replace_ffx_fpe_config':
                            crypto_replace_ffx_fpe_config
                    }
                }
            ]
        }
    }

    # Convert string to item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    # Print results
    print(response.item.value)
Exemple #23
0
def redact_DocumentTypes(project, filename, output_filename, mime_type=None):
    """Uses the Data Loss Prevention API to redact protected data in an image.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    os.environ[
        "GOOGLE_APPLICATION_CREDENTIALS"] = "kubernetes-e9dc8af4883c.json"

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    #  info_types = [{'name': info_type} for info_type in info_types]

    # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
    # contains an info_type and optionally the color used for the replacement.
    # The color is omitted in this sample, so the default (black) will be used.
    image_redaction_configs = []

    # if info_types is not None:
    #     for info_type in info_types:
    #         image_redaction_configs.append({'info_type': info_type})

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    # inspect_config = {
    #     'min_likelihood': min_likelihood,
    #     'info_types': info_types,
    # }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or 'application/octet-stream'

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        a = f.read()
        byte_item = {'type': content_type_index, 'data': a}

    print(byte_item)
    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    # response = dlp.redact_image(
    #     parent, inspect_config=inspect_config,
    #     image_redaction_configs=image_redaction_configs,
    #     byte_item=byte_item)

    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_config": {
                        "new_value": {
                            "string_value": "[XXXXXXXXXXXX]"
                        }
                    }
                }
            }]
        }
    }

    inspect_config = {"info_types": [{"name": "ALL_BASIC"}]}

    item = {"byte_item": byte_item}
    response = dlp.deidentify_content(parent,
                                      inspect_config=inspect_config,
                                      deidentify_config=deidentify_config,
                                      item=item)
    print(response.item.byte_item.data)
    # Write out the results.
    with open(output_filename, mode='wb') as f:
        f.write(response.item.byte_item.data)
    print('Written')