Example 1
def delete_trigger(project, trigger_id):
    """Deletes a Data Loss Prevention API trigger.
    Args:
        project: The id of the Google Cloud project which owns the trigger.
        trigger_id: The id of the trigger to delete.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Combine the trigger id with the parent id.
    trigger_resource = "{}/jobTriggers/{}".format(parent, trigger_id)

    # Call the API.
    dlp.delete_job_trigger(trigger_resource)

    print("Trigger {} successfully deleted.".format(trigger_resource))
Example 2
 def process(self, elem):
     # Import the client library and build a DLP table from the grouped
     # events' email addresses.
     import google.cloud.dlp
     dlp = google.cloud.dlp_v2.DlpServiceClient()
     data = {"header": ["EMAIL"]}
     headers = [{"name": val} for val in data["header"]]
     rows = []
     for event in elem:
         rows.append({"values": [{"string_value": str(event.email)}]})
     table = {}
     table["headers"] = headers
     table["rows"] = rows
     item = {"table": table}
     parent = dlp.project_path(self.project_id)
     deidentify_template_name = "projects/{}/deidentifyTemplates/{}".format(
         self.project_id, self.template_id.get())
     response = dlp.deidentify_content(
         parent,
         deidentify_template_name=deidentify_template_name,
         item=item)
     i = 0
     for event in elem:
         user = User(event.full_name,
                     response.item.table.rows[i].values[0].string_value,
                     event.job, event.city)
         i = i + 1
         yield user.obj_to_Row()
Example 3
def inspect_string(project,
                   content_string,
                   info_types,
                   custom_dictionaries=None,
                   custom_regexes=None,
                   min_likelihood=None,
                   max_findings=None,
                   include_quote=True):

    # Import the client library.
    import google.cloud.dlp

    dlp = google.cloud.dlp.DlpServiceClient()

    parent = dlp.project_path(project)
    item = {'value': content_string}
    # Wrap the single info type name in the dictionary form the API expects.
    info_types = [{'name': info_types}]
    # Apply permissive defaults only when the caller did not pass limits.
    if max_findings is None:
        max_findings = 0
    if min_likelihood is None:
        min_likelihood = 'LIKELIHOOD_UNSPECIFIED'
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    response = dlp.inspect_content(parent, inspect_config, item)
    return response
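A hedged usage sketch for this variant; note it wraps info_types as a single name, so a plain string is passed (project id hypothetical):

# Usage sketch (hypothetical project id). This variant expects a single
# info type name rather than a list of names.
response = inspect_string("my-project",
                          "My email is bob@example.com",
                          "EMAIL_ADDRESS")
for finding in response.result.findings:
    print(finding.info_type.name, finding.likelihood)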
Example 4
def test_job_name():
    import google.cloud.dlp
    import google.api_core.exceptions

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        "privacy_metric": {
            "categorical_stats_config": {
                "field": {
                    "name": TEST_COLUMN_NAME
                }
            }
        },
        "source_table": {
            "project_id": TEST_TABLE_PROJECT_ID,
            "dataset_id": TEST_DATASET_ID,
            "table_id": TEST_TABLE_ID,
        },
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind("/") + 1:]
    yield job_name

    # clean up job if not deleted
    try:
        dlp.delete_dlp_job(full_path)
    except google.api_core.exceptions.NotFound:
        print("Issue during teardown, missing job")
Example 5
def create_test_job():
    import google.cloud.dlp
    dlp = google.cloud.dlp.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {
                'field': {
                    'name': TEST_COLUMN_NAME
                }
            }
        },
        'source_table': {
            'project_id': TEST_TABLE_PROJECT_ID,
            'dataset_id': TEST_DATASET_ID,
            'table_id': TEST_TABLE_ID
        }
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind('/')+1:]
    return job_name
Example 6
def list_dlp_jobs(project, filter_string=None, job_type=None):
    """Uses the Data Loss Prevention API to lists DLP jobs that match the
        specified filter in the request.
    Args:
        project: The project id to use as a parent resource.
        filter_string: (Optional) Allows filtering.
            Supported syntax:
            * Filter expressions are made up of one or more restrictions.
            * Restrictions can be combined by 'AND' or 'OR' logical operators.
            A sequence of restrictions implicitly uses 'AND'.
            * A restriction has the form of '<field> <operator> <value>'.
            * Supported fields/values for inspect jobs:
                - `state` - PENDING|RUNNING|CANCELED|FINISHED|FAILED
                - `inspected_storage` - DATASTORE|CLOUD_STORAGE|BIGQUERY
                - `trigger_name` - The resource name of the trigger that
                                   created job.
            * Supported fields for risk analysis jobs:
                - `state` - RUNNING|CANCELED|FINISHED|FAILED
            * The operator must be '=' or '!='.
            Examples:
            * inspected_storage = cloud_storage AND state = done
            * inspected_storage = cloud_storage OR inspected_storage = bigquery
            * inspected_storage = cloud_storage AND
                                  (state = done OR state = canceled)
        job_type: (Optional) The type of job. Defaults to 'INSPECT'.
            Choices:
            DLP_JOB_TYPE_UNSPECIFIED
            INSPECT_JOB: The job inspected content for sensitive data.
            RISK_ANALYSIS_JOB: The job executed a Risk Analysis computation.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Job type dictionary
    job_type_to_int = {
        "DLP_JOB_TYPE_UNSPECIFIED":
            google.cloud.dlp.enums.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED,
        "INSPECT_JOB": google.cloud.dlp.enums.DlpJobType.INSPECT_JOB,
        "RISK_ANALYSIS_JOB": google.cloud.dlp.enums.DlpJobType.RISK_ANALYSIS_JOB,
    }
    # If job type is specified, convert job type to number through enums.
    if job_type:
        job_type = job_type_to_int[job_type]

    # Call the API to get a list of jobs.
    response = dlp.list_dlp_jobs(parent, filter_=filter_string, type_=job_type)

    # Iterate over results.
    for job in response:
        print("Job: %s; status: %s" % (job.name, job.JobState.Name(job.state)))
Example 7
def delete_trigger(project, trigger_id):
    """Deletes a Data Loss Prevention API trigger.
    Args:
        project: The id of the Google Cloud project which owns the trigger.
        trigger_id: The id of the trigger to delete.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Combine the trigger id with the parent id.
    trigger_resource = '{}/jobTriggers/{}'.format(parent, trigger_id)

    # Call the API.
    dlp.delete_job_trigger(trigger_resource)

    print('Trigger {} successfully deleted.'.format(trigger_resource))
Example 9
def delete_inspect_template(project, template_id):
    """Deletes a Data Loss Prevention API template.
    Args:
        project: The id of the Google Cloud project which owns the template.
        template_id: The id of the template to delete.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Combine the template id with the parent id.
    template_resource = '{}/inspectTemplates/{}'.format(parent, template_id)

    # Call the API.
    dlp.delete_inspect_template(template_resource)

    print('Template {} successfully deleted.'.format(template_resource))
Example 10
def create_inspect_template(project,
                            info_types,
                            template_id=None,
                            display_name=None,
                            min_likelihood=None,
                            max_findings=None,
                            include_quote=None):
    """Creates a Data Loss Prevention API inspect template.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        template_id: The id of the template. If omitted, an id will be randomly
            generated.
        display_name: The optional display name of the template.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    inspect_template = {
        'inspect_config': inspect_config,
        'display_name': display_name,
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.create_inspect_template(parent,
                                           inspect_template=inspect_template,
                                           template_id=template_id)

    print('Successfully created template {}'.format(response.name))
Example 11
def test_job_name():
    import google.cloud.dlp
    dlp = google.cloud.dlp.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {
                'field': {
                    'name': TEST_COLUMN_NAME
                }
            }
        },
        'source_table': {
            'project_id': TEST_TABLE_PROJECT_ID,
            'dataset_id': TEST_DATASET_ID,
            'table_id': TEST_TABLE_ID
        }
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind('/')+1:]
    return job_name
Example 12
def test_job_name():
    import google.cloud.dlp

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        "privacy_metric": {
            "categorical_stats_config": {
                "field": {
                    "name": TEST_COLUMN_NAME
                }
            }
        },
        "source_table": {
            "project_id": TEST_TABLE_PROJECT_ID,
            "dataset_id": TEST_DATASET_ID,
            "table_id": TEST_TABLE_ID,
        },
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind("/") + 1:]
    return job_name
Example 13
def deidentify_with_replace(
    project,
    input_str,
    info_types,
    replacement_str="REPLACEMENT_STR",
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing matched input values with a value you specify.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        replacement_str: The string to replace all values that match given
            info types.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{
            "name": info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_config": {
                        "new_value": {
                            "string_value": replacement_str,
                        }
                    }
                }
            }]
        }
    }

    # Construct item
    item = {"value": input_str}

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item,
    )

    # Print out the results.
    print(response.item.value)
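A usage sketch with a hypothetical project id:

# Usage sketch: prints the input with the email address replaced
# by "[REDACTED]".
deidentify_with_replace("my-project",
                        "Contact me at alice@example.com",
                        ["EMAIL_ADDRESS"],
                        replacement_str="[REDACTED]")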
Example 14
def list_triggers(project):
    """Lists all Data Loss Prevention API triggers.
    Args:
        project: The Google Cloud project id to use as a parent resource.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # The standard time module is used below to format timestamps.
    import time

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.list_job_triggers(parent)

    # Define a helper function to convert the API's "seconds since the epoch"
    # time format into a human-readable string.
    def human_readable_time(timestamp):
        return str(time.localtime(timestamp.seconds))

    for trigger in response:
        print('Trigger {}:'.format(trigger.name))
        print('  Created: {}'.format(human_readable_time(trigger.create_time)))
        print('  Updated: {}'.format(human_readable_time(trigger.update_time)))
        if trigger.display_name:
            print('  Display Name: {}'.format(trigger.display_name))
        if trigger.description:
            print('  Description: {}'.format(trigger.description))
        print('  Status: {}'.format(trigger.status))
        print('  Error count: {}'.format(len(trigger.errors)))
Example 15
def deidentify_with_mask(project,
                         string,
                         info_types,
                         masking_character=None,
                         number_to_mask=0):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        string: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        masking_character: The character to mask matching sensitive data with.
        number_to_mask: The maximum number of sensitive characters to mask in
            a match. If omitted or set to zero, the API will default to no
            maximum.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{
            'name': info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [{
                'primitive_transformation': {
                    'character_mask_config': {
                        'masking_character': masking_character,
                        'number_to_mask': number_to_mask
                    }
                }
            }]
        }
    }

    # Construct item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(parent,
                                      inspect_config=inspect_config,
                                      deidentify_config=deidentify_config,
                                      item=item)

    # Print out the results.
    print(response.item.value)
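A usage sketch with a hypothetical project id and a made-up SSN:

# Usage sketch: masks up to the first three characters of each detected
# US Social Security number with '#'.
deidentify_with_mask("my-project",
                     "My SSN is 372-76-1954",
                     ["US_SOCIAL_SECURITY_NUMBER"],
                     masking_character="#",
                     number_to_mask=3)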
Example 16
def inspectdata(data, project_id, template_id):
    # Import the client library.
    import google.cloud.dlp

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(project_id)
    inspect_template = f"projects/{project_id}/inspectTemplates/{template_id}"
    response = dlp.inspect_content(parent,
                                   inspect_template_name=inspect_template,
                                   item=data)
    return response
Example 17
def inspect_with_medical_record_number_custom_regex_detector(
    project,
    content_string,
):
    """Uses the Data Loss Prevention API to analyze string with medical record
       number custom regex detector
    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct a custom regex detector info type called "C_MRN",
    # with ###-#-##### pattern, where each # represents a digit from 1 to 9.
    # The detector has a detection likelihood of POSSIBLE.
    custom_info_types = [{
        "info_type": {
            "name": "C_MRN"
        },
        "regex": {
            "pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"
        },
        "likelihood": "POSSIBLE",
    }]

    # Construct the configuration dictionary with the custom regex info type.
    inspect_config = {
        "custom_info_types": custom_info_types,
    }

    # Construct the `item`.
    item = {"value": content_string}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    print(f"Quote: {finding.quote}")
            except AttributeError:
                pass
            print(f"Info type: {finding.info_type.name}")
            print(f"Likelihood: {finding.likelihood}")
    else:
        print("No findings.")
Example 18
def inspect_string(project, content_string, info_types,
                   min_likelihood=None, max_findings=None, include_quote=True):
    """Uses the Data Loss Prevention API to analyze strings for protected data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `item`.
    item = {'value': content_string}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
Example 19
def deidentify_with_mask(project, string, info_types, masking_character=None,
                         number_to_mask=0):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        string: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        masking_character: The character to mask matching sensitive data with.
        number_to_mask: The maximum number of sensitive characters to mask in
            a match. If omitted or set to zero, the API will default to no
            maximum.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'character_mask_config': {
                            'masking_character': masking_character,
                            'number_to_mask': number_to_mask
                        }
                    }
                }
            ]
        }
    }

    # Construct item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    # Print out the results.
    print(response.item.value)
Example 20
def inspect(text):
    # Import the client library.
    import google.cloud.dlp

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    # Note: the project id is hardcoded for this demo.
    parent = dlp.project_path('roi-gcp-demos')

    inspect_config = {
        "info_types": [
            {
                "name": "EMAIL_ADDRESS"
            },
            {
                "name": "CREDIT_CARD_NUMBER"
            },
            {
                "name": "GENERIC_ID"
            },
            {
                "name": "IP_ADDRESS"
            },
            {
                "name": "PHONE_NUMBER"
            },
            {
                "name": "US_DRIVERS_LICENSE_NUMBER"
            },
            {
                "name": "US_SOCIAL_SECURITY_NUMBER"
            },
        ],
        "include_quote":
        True
    }

    item = {"value": text}

    # Call the API
    response = dlp.inspect_content(
        parent,
        inspect_config=inspect_config,
        item=item,
    )

    result = ""
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    result += "Quote: {}<br>".format(finding.quote)
            except AttributeError:
                pass
            result += "Info type: {}<br>".format(finding.info_type.name)
            result += "Likelihood: {}<br>".format(finding.likelihood)
            result += "<br>"
    else:
        result = "No findings."
    return {"result": result}
Example 21
def create_inspect_template(project, info_types,
                            template_id=None, display_name=None,
                            min_likelihood=None, max_findings=None,
                            include_quote=None):
    """Creates a Data Loss Prevention API inspect template.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        template_id: The id of the template. If omitted, an id will be randomly
            generated.
        display_name: The optional display name of the template.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    inspect_template = {
        'inspect_config': inspect_config,
        'display_name': display_name,
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.create_inspect_template(
        parent, inspect_template=inspect_template, template_id=template_id)

    print('Successfully created template {}'.format(response.name))
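A usage sketch with hypothetical ids:

# Usage sketch: creates a reusable template that looks for email
# addresses and phone numbers.
create_inspect_template("my-project",
                        ["EMAIL_ADDRESS", "PHONE_NUMBER"],
                        template_id="my-template",
                        min_likelihood="POSSIBLE",
                        max_findings=0,
                        include_quote=True)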
Example 22
def create_DLP_job(data, done):
    """This function is triggered by new files uploaded to the designated Cloud Storage quarantine/staging bucket.

       It creates a dlp job for the uploaded file.
    Args:
        data: The Cloud Storage event.
        done: The event context (unused).
    Returns:
        None. Debug information is printed to the log.
    """
    # Get the targeted file in the quarantine bucket
    file_name = data['name']
    print('Function triggered for file [{}]'.format(file_name))

    # Prepare info_types by converting the list of strings (INFO_TYPES) into a list of dictionaries
    info_types = [{'name': info_type} for info_type in INFO_TYPES]

    # Convert the project id into a full resource id.
    parent = dlp.project_path(PROJECT_ID)

    # Construct the configuration dictionary.
    inspect_job = {
        'inspect_config': {
            'info_types': info_types,
            'min_likelihood': MIN_LIKELIHOOD,
            'limits': {
                'max_findings_per_request': MAX_FINDINGS
            },
        },
        'storage_config': {
            'cloud_storage_options': {
                'file_set': {
                    'url':
                    'gs://{bucket_name}/{file_name}'.format(
                        bucket_name=STAGING_BUCKET, file_name=file_name)
                }
            }
        },
        'actions': [{
            'pub_sub': {
                'topic':
                'projects/{project_id}/topics/{topic_id}'.format(
                    project_id=PROJECT_ID, topic_id=PUB_SUB_TOPIC)
            }
        }]
    }

    # Create the DLP job and let the DLP API process it.
    try:
        dlp.create_dlp_job(parent, inspect_job)
        print('Job created by create_DLP_job')
    except Exception as e:
        print(e)
Example 23
 def deindentify(self, data, project_id, template_id):
     headers = [{"name": val} for val in data["header"]]
     rows = []
     for row in data["rows"]:
         rows.append({"values": [{"string_value": cell_val} for cell_val in row]})
     table = {"headers": headers, "rows": rows}
     item = {"table": table}
     dlp = google.cloud.dlp_v2.DlpServiceClient()
     parent = dlp.project_path(project_id)
     deidentify_template = f"projects/{project_id}/deidentifyTemplates/{template_id}"
     response = dlp.deidentify_content(
         parent, deidentify_template_name=deidentify_template, item=item)
     return response
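For reference, a sketch of the input shape this method expects; the column names and cell values are hypothetical:

# Input sketch: "header" holds column names and "rows" holds lists of
# cell strings, mirroring the loops above.
data = {
    "header": ["name", "email"],
    "rows": [["Alice", "alice@example.com"],
             ["Bob", "bob@example.com"]],
}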
Example 24
def inspect_file(project,
                 filename,
                 info_types,
                 custom_dictionaries=None,
                 custom_regexes=None,
                 min_likelihood=None,
                 max_findings=None,
                 include_quote=True,
                 mime_type=None):

    # Imports needed by this snippet.
    import mimetypes
    import os
    import requests
    import google.cloud.dlp

    dlp = google.cloud.dlp.DlpServiceClient()

    # These inline configs could instead be stored templates.
    info_types = [{'name': info_types}]
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'limits': {
            'max_findings_per_request': max_findings
        },
        'include_quote': include_quote,
    }

    # If no MIME type was given, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    supported_content_types = {
        None: 0,
    }
    content_type_index = supported_content_types.get(mime_type, 0)
    try:
        headers = {'Authorization': 'Bearer ' + os.environ['TOKEN']}
        r = requests.get(filename, headers=headers, stream=True)

        with open(filename.split('/')[-1], 'wb') as f:
            f.write(r.content)
    except Exception as e:
        return e
    if 'xlsx' in filename.split('/')[-1]:
        return 'I am unable to read this file.  Please try again.'

    with open(filename.split('/')[-1], mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    parent = dlp.project_path(project)
    response = dlp.inspect_content(parent, inspect_config, item)
    return response.result
Example 25
def deidentify_with_replace_infotype(project, item, info_types):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing it with the info type.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{
            "name": info_type
        } for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [{
                "primitive_transformation": {
                    "replace_with_info_type_config": {}
                }
            }]
        }
    }

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item={"value": item},
    )

    # Print out the results.
    print(response.item.value)
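A usage sketch with a hypothetical project id; each match is replaced by its info type name:

# Usage sketch: for this input the printed output should read
# "Reach me at [EMAIL_ADDRESS]".
deidentify_with_replace_infotype("my-project",
                                 "Reach me at carol@example.com",
                                 ["EMAIL_ADDRESS"])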
Example 26
def redact_image_all_text(
    project,
    filename,
    output_filename,
):
    """Uses the Data Loss Prevention API to redact all text in an image.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.

    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the image_redaction_configs, indicating to DLP that all text in
    # the input image should be redacted.
    image_redaction_configs = [{
        "redact_all_text": True,
    }]

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode="rb") as f:
        byte_item = {"type": "IMAGE", "data": f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item,
    )

    # Write out the results.
    with open(output_filename, mode="wb") as f:
        f.write(response.redacted_image)

    print("Wrote {byte_count} to {filename}".format(byte_count=len(
        response.redacted_image),
                                                    filename=output_filename))
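A usage sketch with hypothetical paths:

# Usage sketch: writes a copy of the image with all detected text
# redacted.
redact_image_all_text("my-project", "scan.png", "scan-redacted.png")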
Example 27
def deidentify_with_mask(project, string, info_types, masking_character=None,
                         number_to_mask=0):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'character_mask_config': {
                            'masking_character': masking_character,
                            'number_to_mask': number_to_mask
                        }
                    }
                }
            ]
        }
    }

    # Construct item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    return response.item.value
Example 28
def deidentify_with_mask(data, done):

    # Convert the project id into a full resource id.
    parent = dlp.project_path(PROJECT_ID)

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in INFO_TYPES]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'character_mask_config': {
                            'masking_character': 'X',
                            'number_to_mask': 0
                        }
                    }
                }
            ]
        }
    }


    storage_client = storage.Client()
    bucket = storage_client.get_bucket(SENSITIVE_BUCKET)

    blobs = bucket.list_blobs()

    for blob in blobs:
        gcs_file = blob.download_as_string()
        item = {'value': gcs_file}

        # Call the API.
        response = dlp.deidentify_content(
            parent, inspect_config=inspect_config,
            deidentify_config=deidentify_config, item=item)
        masked_item = response.item.value

        # Upload the masked content to the destination bucket.
        destination_bucket = storage_client.get_bucket(MASKED_BUCKET)
        masked_blob = Blob(blob.name, destination_bucket)
        masked_blob.upload_from_string(masked_item)
Example 29
def list_inspect_templates(project):
    """Lists all Data Loss Prevention API inspect templates.
    Args:
        project: The Google Cloud project id to use as a parent resource.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # The standard time module is used below to format timestamps.
    import time

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.list_inspect_templates(parent)

    # Define a helper function to convert the API's "seconds since the epoch"
    # time format into a human-readable string.
    def human_readable_time(timestamp):
        return str(time.localtime(timestamp.seconds))

    for template in response:
        print('Template {}:'.format(template.name))
        if template.display_name:
            print('  Display Name: {}'.format(template.display_name))
        print('  Created: {}'.format(
            human_readable_time(template.create_time)))
        print('  Updated: {}'.format(
            human_readable_time(template.update_time)))

        config = template.inspect_config
        print('  InfoTypes: {}'.format(', '.join(
            [it.name for it in config.info_types]
        )))
        print('  Minimum likelihood: {}'.format(config.min_likelihood))
        print('  Include quotes: {}'.format(config.include_quote))
        print('  Max findings per request: {}'.format(
            config.limits.max_findings_per_request))
Example 30
def create_stored_info_type(project,
                            dictionary_config,
                            stored_info_type_id=None,
                            display_name=None,
                            description=None):
    """Creates a scheduled Data Loss Prevention API stored infoType from a
        column of a BigQuery.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        dictionary_config: The config for the large custom dictionary.
        stored_info_type_id: The id of the stored infoType. If omitted, an id
            will be randomly generated.
        display_name: The optional display name of the stored infoType.
        description: The optional description of the stored infoType.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Create the stored infoType config.
    stored_info_type_config = {
        'display_name': display_name,
        'description': description,
        'large_custom_dictionary': dictionary_config
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.create_stored_info_type(
        parent,
        config=stored_info_type_config,
        stored_info_type_id=stored_info_type_id)

    print('Successfully created stored infoType {}'.format(response.name))
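A sketch of one possible dictionary_config, sourcing the dictionary from a BigQuery column; the field names follow the LargeCustomDictionaryConfig message, and every id and path below is hypothetical:

# Hypothetical config: build the dictionary from a BigQuery column and
# store the generated dictionary files under a Cloud Storage prefix.
dictionary_config = {
    'output_path': {'path': 'gs://my-bucket/dlp/dictionary'},
    'big_query_field': {
        'table': {
            'project_id': 'my-project',
            'dataset_id': 'my_dataset',
            'table_id': 'my_table',
        },
        'field': {'name': 'dictionary_column'},
    },
}
create_stored_info_type('my-project', dictionary_config,
                        stored_info_type_id='my-stored-infotype')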
Example 31
def list_inspect_templates(project):
    """Lists all Data Loss Prevention API inspect templates.
    Args:
        project: The Google Cloud project id to use as a parent resource.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # The standard time module is used below to format timestamps.
    import time

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.list_inspect_templates(parent)

    # Define a helper function to convert the API's "seconds since the epoch"
    # time format into a human-readable string.
    def human_readable_time(timestamp):
        return str(time.localtime(timestamp.seconds))

    for template in response:
        print('Template {}:'.format(template.name))
        if template.display_name:
            print('  Display Name: {}'.format(template.display_name))
        print('  Created: {}'.format(human_readable_time(
            template.create_time)))
        print('  Updated: {}'.format(human_readable_time(
            template.update_time)))

        config = template.inspect_config
        print('  InfoTypes: {}'.format(', '.join(
            [it.name for it in config.info_types])))
        print('  Minimum likelihood: {}'.format(config.min_likelihood))
        print('  Include quotes: {}'.format(config.include_quote))
        print('  Max findings per request: {}'.format(
            config.limits.max_findings_per_request))
Example 32
def list_stored_info_types(project):
    """Lists all Data Loss Prevention API stored infoTypes.
    Args:
        project: The Google Cloud project id to use as a parent resource.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # The standard time module is used below to format timestamps.
    import time

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.list_stored_info_types(parent)

    # Define a helper function to convert the API's "seconds since the epoch"
    # time format into a human-readable string.
    def human_readable_time(timestamp):
        return str(time.localtime(timestamp.seconds))

    for stored_info_type in response:
        print('Stored infoType {}:'.format(stored_info_type.name))
        if stored_info_type.current_version:
            version = stored_info_type.current_version
            print('  Current version:')
            print('    Created: {}'.format(
                human_readable_time(version.create_time)))
            print('    State: {}'.format(version.state))
            print('    Error count: {}'.format(len(version.errors)))
        if stored_info_type.pending_versions:
            print('  Pending versions:')
            for version in stored_info_type.pending_versions:
                print('    Created: {}'.format(
                    human_readable_time(version.create_time)))
                print('    State: {}'.format(version.state))
                print('    Error count: {}'.format(len(version.errors)))
Example 33
def redact_text(data, project):
    logging.info(data)
    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(project)

    # This will detect PII data for the info types listed
    # https://cloud.google.com/dlp/docs/infotypes-reference
    info_types = [
        "PERSON_NAME", "PHONE_NUMBER", "ORGANIZATION_NAME", "FIRST_NAME",
        "LAST_NAME", "EMAIL_ADDRESS", "DATE_OF_BIRTH",
        "US_SOCIAL_SECURITY_NUMBER", "STREET_ADDRESS"
    ]

    info_types = [{"name": info_type} for info_type in info_types]

    inspect_config = {"info_types": info_types, "include_quote": True}
    logging.info(data['transcript'])
    item = {"value": data['transcript']}
    response = dlp.inspect_content(
        parent,
        inspect_config=inspect_config,
        item=item,
    )
    logging.info(response)
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    print("Quote: {}".format(finding.quote))
                    data['dlp'].append(finding.quote)
            except AttributeError:
                pass
    else:
        print("No findings.")
    return data
Example 34
def deidentify_with_fpe(project, string, info_types, alphabet=None,
                        surrogate_type=None, key_name=None, wrapped_key=None):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string using Format Preserving Encryption (FPE).
    Args:
        project: The Google Cloud project id to use as a parent resource.
        string: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        alphabet: The set of characters to replace sensitive ones with. For
            more information, see https://cloud.google.com/dlp/docs/reference/
            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        surrogate_type: The name of the surrogate custom info type to use. Only
            necessary if you want to reverse the deidentification process. Can
            be essentially any arbitrary string, as long as it doesn't appear
            in your dataset otherwise.
        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
            AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
            should be encrypted using the Cloud KMS key specified by key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    import base64
    wrapped_key = base64.b64decode(wrapped_key)

    # Construct FPE configuration dictionary
    crypto_replace_ffx_fpe_config = {
        'crypto_key': {
            'kms_wrapped': {
                'wrapped_key': wrapped_key,
                'crypto_key_name': key_name
            }
        },
        'common_alphabet': alphabet
    }

    # Add surrogate type
    if surrogate_type:
        crypto_replace_ffx_fpe_config['surrogate_info_type'] = {
            'name': surrogate_type
        }

    # Construct inspect configuration dictionary
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [
                {
                    'primitive_transformation': {
                        'crypto_replace_ffx_fpe_config':
                            crypto_replace_ffx_fpe_config
                    }
                }
            ]
        }
    }

    # Convert string to item
    item = {'value': string}

    # Call the API
    response = dlp.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    # Print results
    print(response.item.value)
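A usage sketch; the KMS key name and wrapped key are hypothetical placeholders, and wrapped_key must be the base64-encoded AES-256 key wrapped by that KMS key:

# Usage sketch: replaces detected SSN digits with format-preserving
# ciphertext drawn from the NUMERIC alphabet.
deidentify_with_fpe(
    'my-project',
    'My SSN is 372-76-1954',
    ['US_SOCIAL_SECURITY_NUMBER'],
    alphabet='NUMERIC',
    key_name=('projects/my-project/locations/global/'
              'keyRings/my-keyring/cryptoKeys/my-key'),
    wrapped_key='BASE64_ENCODED_WRAPPED_KEY')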
Example 35
def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
                     topic_id, subscription_id, info_types,
                     custom_dictionaries=None, custom_regexes=None,
                     min_likelihood=None, max_findings=None, timeout=300):
    """Uses the Data Loss Prevention API to analyze BigQuery data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bigquery_project: The Google Cloud project id of the target table.
        dataset_id: The id of the target BigQuery dataset.
        table_id: The id of the target BigQuery table.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: (Optional) A list of comma-separated word lists
            to use as custom dictionary detectors.
        custom_regexes: (Optional) A list of regex patterns to use as custom
            regex detectors.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for a response from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct a storage_config containing the target Bigquery info.
    storage_config = {
        'big_query_options': {
            'table_reference': {
                'project_id': bigquery_project,
                'dataset_id': dataset_id,
                'table_id': table_id,
            }
        }
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        'inspect_config': inspect_config,
        'storage_config': storage_config,
        'actions': actions,
    }

    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if (message.attributes['DlpJobName'] == operation.name):
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                if job.inspect_details.result.info_type_stats:
                    for finding in job.inspect_details.result.info_type_stats:
                        print('Info type: {}; Count: {}'.format(
                            finding.info_type.name, finding.count))
                else:
                    print('No findings.')

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
Example 36
def redact_image(project, filename, output_filename,
                 info_types, min_likelihood=None, mime_type=None):
    """Uses the Data Loss Prevention API to redact protected data in an image.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # This sample uses the standard library's mimetypes module to guess the
    # file's MIME type when one is not provided.
    import mimetypes

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted). Guard against None before
    # iterating, since the argument is optional.
    if info_types is None:
        info_types = []
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
    # contains an info_type and optionally the color used for the replacement.
    # The color is omitted in this sample, so the default (black) will be used.
    image_redaction_configs = [
        {'info_type': info_type} for info_type in info_types
    ]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'min_likelihood': min_likelihood,
        'info_types': info_types,
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or 'application/octet-stream'

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        byte_item = {'type': content_type_index, 'data': f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent, inspect_config=inspect_config,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item)

    # Write out the results.
    with open(output_filename, mode='wb') as f:
        f.write(response.redacted_image)
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image), filename=output_filename))
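
A hypothetical invocation of this sample might look like the following (the
project id, file paths, and info types are illustrative placeholders):

    redact_image('my-project', 'photo.png', 'photo-redacted.png',
                 info_types=['PHONE_NUMBER', 'EMAIL_ADDRESS'],
                 min_likelihood='POSSIBLE')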
Example 37
def deidentify_with_date_shift(project, input_csv_file=None,
                               output_csv_file=None, date_fields=None,
                               lower_bound_days=None, upper_bound_days=None,
                               context_field_id=None, wrapped_key=None,
                               key_name=None):
    """Uses the Data Loss Prevention API to deidentify dates in a CSV file by
        pseudorandomly shifting them.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_csv_file: The path to the CSV file to deidentify. The first row
            of the file must specify column names, and all other rows must
            contain valid values.
        output_csv_file: The path to save the date-shifted CSV file.
        date_fields: The list of (date) fields in the CSV file to date shift.
            Example: ['birth_date', 'register_date']
        lower_bound_days: The maximum number of days to shift a date backward
        upper_bound_days: The maximum number of days to shift a date forward
        context_field_id: (Optional) The column to determine date shift amount
            based on. If this is not specified, a random shift amount will be
            used for every row. If this is specified, then 'wrapped_key' and
            'key_name' must also be set. Example:
            context_field_id = 'user_id'
        key_name: (Optional) The name of the Cloud KMS key used to encrypt
            ('wrap') the AES-256 key. Example:
            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
        wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use.
            This key should be encrypted using the Cloud KMS key specified by
            key_name.
    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Convert date field list to Protobuf type
    def map_fields(field):
        return {'name': field}

    # Materialize the result as a list; in Python 3, map() returns a lazy
    # iterator, which the protobuf conversion does not accept.
    if date_fields:
        date_fields = [map_fields(field) for field in date_fields]
    else:
        date_fields = []

    # Read and parse the CSV file
    import csv
    from datetime import datetime
    f = []
    with open(input_csv_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            f.append(row)

    # Helper functions for converting CSV rows to Protobuf types
    def map_headers(header):
        return {'name': header}

    def map_data(value):
        try:
            date = datetime.strptime(value, '%m/%d/%Y')
            return {
                'date_value': {
                    'year': date.year,
                    'month': date.month,
                    'day': date.day
                }
            }
        except ValueError:
            return {'string_value': value}

    def map_rows(row):
        return {'values': [map_data(value) for value in row]}

    # Using the helper functions, convert CSV rows to protobuf-compatible
    # dictionaries (as lists rather than lazy map objects).
    csv_headers = [map_headers(header) for header in f[0]]
    csv_rows = [map_rows(row) for row in f[1:]]

    # Construct the table dict
    table_item = {
        'table': {
            'headers': csv_headers,
            'rows': csv_rows
        }
    }

    # Construct date shift config
    date_shift_config = {
        'lower_bound_days': lower_bound_days,
        'upper_bound_days': upper_bound_days
    }

    # If using a Cloud KMS key, add it to the date_shift_config.
    # The wrapped key is base64-encoded, but the library expects a binary
    # string, so decode it here.
    if context_field_id and key_name and wrapped_key:
        import base64
        date_shift_config['context'] = {'name': context_field_id}
        date_shift_config['crypto_key'] = {
            'kms_wrapped': {
                'wrapped_key': base64.b64decode(wrapped_key),
                'crypto_key_name': key_name
            }
        }
    elif context_field_id or key_name or wrapped_key:
        raise ValueError("""You must set either ALL or NONE of
        [context_field_id, key_name, wrapped_key]!""")

    # Construct Deidentify Config
    deidentify_config = {
        'record_transformations': {
            'field_transformations': [
                {
                    'fields': date_fields,
                    'primitive_transformation': {
                        'date_shift_config': date_shift_config
                    }
                }
            ]
        }
    }

    # Write to CSV helper methods
    def write_header(header):
        return header.name

    def write_data(data):
        return data.string_value or '%s/%s/%s' % (data.date_value.month,
                                                  data.date_value.day,
                                                  data.date_value.year)

    # Call the API
    response = dlp.deidentify_content(
        parent, deidentify_config=deidentify_config, item=table_item)

    # Write results to CSV file
    with open(output_csv_file, 'w') as csvfile:
        write_file = csv.writer(csvfile, delimiter=',')
        write_file.writerow(map(write_header, response.item.table.headers))
        for row in response.item.table.rows:
            write_file.writerow(map(write_data, row.values))
    # Print status
    print('Successfully saved date-shift output to {}'.format(
        output_csv_file))
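
As a sketch (all identifiers below are placeholders): calling the function
without the optional KMS arguments shifts each row's dates by an independent
random amount within the given bounds.

    deidentify_with_date_shift(
        'my-project', input_csv_file='dates.csv',
        output_csv_file='dates-shifted.csv',
        date_fields=['birth_date', 'register_date'],
        lower_bound_days=30, upper_bound_days=30)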
Example 38
def list_dlp_jobs(project, filter_string=None, job_type=None):
    """Uses the Data Loss Prevention API to lists DLP jobs that match the
        specified filter in the request.
    Args:
        project: The project id to use as a parent resource.
        filter: (Optional) Allows filtering.
            Supported syntax:
            * Filter expressions are made up of one or more restrictions.
            * Restrictions can be combined by 'AND' or 'OR' logical operators.
            A sequence of restrictions implicitly uses 'AND'.
            * A restriction has the form of '<field> <operator> <value>'.
            * Supported fields/values for inspect jobs:
                - `state` - PENDING|RUNNING|CANCELED|FINISHED|FAILED
                - `inspected_storage` - DATASTORE|CLOUD_STORAGE|BIGQUERY
                - `trigger_name` - The resource name of the trigger that
                                   created job.
            * Supported fields for risk analysis jobs:
                - `state` - RUNNING|CANCELED|FINISHED|FAILED
            * The operator must be '=' or '!='.
            Examples:
            * inspected_storage = cloud_storage AND state = done
            * inspected_storage = cloud_storage OR inspected_storage = bigquery
            * inspected_storage = cloud_storage AND
                                  (state = done OR state = canceled)
        job_type: (Optional) The type of job. Defaults to 'INSPECT_JOB'.
            Choices:
            DLP_JOB_TYPE_UNSPECIFIED
            INSPECT_JOB: The job inspected content for sensitive data.
            RISK_ANALYSIS_JOB: The job executed a Risk Analysis computation.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Job type dictionary
    job_type_to_int = {
        'DLP_JOB_TYPE_UNSPECIFIED':
            google.cloud.dlp.enums.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED,
        'INSPECT_JOB': google.cloud.dlp.enums.DlpJobType.INSPECT_JOB,
        'RISK_ANALYSIS_JOB':
            google.cloud.dlp.enums.DlpJobType.RISK_ANALYSIS_JOB
    }
    # If job type is specified, convert job type to number through enums.
    if job_type:
        job_type = job_type_to_int[job_type]

    # Call the API to get a list of jobs.
    response = dlp.list_dlp_jobs(
        parent,
        filter_=filter_string,
        type_=job_type)

    # Iterate over results.
    for job in response:
        print('Job: %s; status: %s' % (job.name, job.JobState.Name(job.state)))
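
For instance, a hypothetical call that lists finished inspect jobs (the
project id and filter value are placeholders, following the docstring's own
filter examples):

    list_dlp_jobs('my-project',
                  filter_string='state=done',
                  job_type='INSPECT_JOB')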
Example 39
def numerical_risk_analysis(project, table_project_id, dataset_id, table_id,
                            column_name, topic_id, subscription_id,
                            timeout=300):
    """Uses the Data Loss Prevention API to compute risk metrics of a column
       of numerical data in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        column_name: The name of the column to compute risk metrics for.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the name of the numeric column to compute risk metrics for
    risk_job = {
        'privacy_metric': {
            'numerical_stats_config': {
                'field': {
                    'name': column_name
                }
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if message.attributes['DlpJobName'] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                results = job.risk_details.numerical_stats_result
                print('Value Range: [{}, {}]'.format(
                    results.min_value.integer_value,
                    results.max_value.integer_value))
                prev_value = None
                for percent, result in enumerate(results.quantile_values):
                    value = result.integer_value
                    if prev_value != value:
                        print('Value at {}% quantile: {}'.format(
                              percent, value))
                        prev_value = value
                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event. (This uses the current
    # Pub/Sub subscribe API, matching the other samples in this collection,
    # rather than the deprecated subscription.open() pattern.)
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
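
A hypothetical call (all ids are placeholders; the Pub/Sub topic and
subscription are assumed to already exist and be linked to each other):

    numerical_risk_analysis(
        'my-project', 'bq-project', 'my_dataset', 'my_table',
        column_name='age', topic_id='dlp-topic',
        subscription_id='dlp-subscription')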
Example 40
def inspect_file(
    project,
    filename,
    info_types,
    min_likelihood=None,
    custom_dictionaries=None,
    custom_regexes=None,
    max_findings=None,
    include_quote=True,
    mime_type=None,
):
    """Uses the Data Loss Prevention API to analyze a file for protected data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely. Wire through include_quote so the
    # documented parameter actually takes effect.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        "text/plain": 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode="rb") as f:
        item = {"byte_item": {"type": content_type_index, "data": f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                print("Quote: {}".format(finding.quote))
            except AttributeError:
                pass
            print("Info type: {}".format(finding.info_type.name))
            print("Likelihood: {}".format(finding.likelihood))
    else:
        print("No findings.")
Example 41
def l_diversity_analysis(project, table_project_id, dataset_id, table_id,
                         topic_id, subscription_id, sensitive_attribute,
                         quasi_ids, timeout=300):
    """Uses the Data Loss Prevention API to compute the l-diversity of a
        column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        sensitive_attribute: The column to measure l-diversity relative to.
        quasi_ids: A set of columns that form a composite key.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Convert quasi id list to Protobuf type. Materialize the result as a
    # list; in Python 3, map() returns a lazy iterator, which the protobuf
    # conversion does not accept.
    def map_fields(field):
        return {'name': field}

    quasi_ids = [map_fields(field) for field in quasi_ids]

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the sensitive attribute and the quasi-identifier columns to analyze
    risk_job = {
        'privacy_metric': {
            'l_diversity_config': {
                'quasi_ids': quasi_ids,
                'sensitive_attribute': {
                    'name': sensitive_attribute
                }
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        try:
            if message.attributes['DlpJobName'] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                histogram_buckets = (
                    job.risk_details
                       .l_diversity_result
                       .sensitive_value_frequency_histogram_buckets)
                # Print bucket stats
                for i, bucket in enumerate(histogram_buckets):
                    print('Bucket {}:'.format(i))
                    print('   Bucket size range: [{}, {}]'.format(
                        bucket.sensitive_value_frequency_lower_bound,
                        bucket.sensitive_value_frequency_upper_bound))
                    for value_bucket in bucket.bucket_values:
                        print('   Quasi-ID values: {}'.format(
                            [get_values(v)
                             for v in value_bucket.quasi_ids_values]))
                        print('   Class size: {}'.format(
                            value_bucket.equivalence_class_size))
                        for value in value_bucket.top_sensitive_values:
                            print(('   Sensitive value {} occurs {} time(s)'
                                   .format(value.value, value.count)))
                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event. (This uses the current
    # Pub/Sub subscribe API rather than the deprecated subscription.open().)
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
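
A hypothetical invocation (all ids and column names are placeholders):

    l_diversity_analysis(
        'my-project', 'bq-project', 'my_dataset', 'my_table',
        topic_id='dlp-topic', subscription_id='dlp-subscription',
        sensitive_attribute='disease', quasi_ids=['age', 'zip_code'])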
Example 42
def inspect_table(
    project,
    data,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    include_quote=True,
):
    """Uses the Data Loss Prevention API to analyze strings for protected data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A dictionary representing table data, as in the example below.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    Example:
        data = {
            "header":[
                "email",
                "phone number"
            ],
            "rows":[
                [
                    "*****@*****.**",
                    "4232342345"
                ],
                [
                    "*****@*****.**",
                    "4253458383"
                ]
            ]
        }

        >> $ python inspect_content.py table \
        '{"header": ["email", "phone number"],
        "rows": [["*****@*****.**", "4232342345"],
        ["*****@*****.**", "4253458383"]]}'
        >>  Quote: [email protected]
            Info type: EMAIL_ADDRESS
            Likelihood: 4
            Quote: [email protected]
            Info type: EMAIL_ADDRESS
            Likelihood: 4
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    headers = [{"name": val} for val in data["header"]]
    rows = []
    for row in data["rows"]:
        rows.append({"values": [{"string_value": cell_val} for cell_val in row]})

    table = {"headers": headers, "rows": rows}
    item = {"table": table}
    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    print("Quote: {}".format(finding.quote))
            except AttributeError:
                pass
            print("Info type: {}".format(finding.info_type.name))
            print("Likelihood: {}".format(finding.likelihood))
    else:
        print("No findings.")
def inspect_file(project, filename, info_types, min_likelihood=None,
                 custom_dictionaries=None, custom_regexes=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Uses the Data Loss Prevention API to analyze a file for protected data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely. Wire through include_quote so the
    # documented parameter actually takes effect.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Uses the Data Loss Prevention API to analyze strings for protected data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A dictionary representing table data, as in the example below.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    Example:
        data = {
            "header":[
                "email",
                "phone number"
            ],
            "rows":[
                [
                    "*****@*****.**",
                    "4232342345"
                ],
                [
                    "*****@*****.**",
                    "4253458383"
                ]
            ]
        }

        >> $ python inspect_content.py table \
        '{"header": ["email", "phone number"],
        "rows": [["*****@*****.**", "4232342345"],
        ["*****@*****.**", "4253458383"]]}'
        >>  Quote: [email protected]
            Info type: EMAIL_ADDRESS
            Likelihood: 4
            Quote: [email protected]
            Info type: EMAIL_ADDRESS
            Likelihood: 4
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    headers = [{"name": val} for val in data["header"]]
    rows = []
    for row in data["rows"]:
        rows.append({
            "values": [{"string_value": cell_val} for cell_val in row]
        })

    table = {"headers": headers, "rows": rows}
    item = {"table": table}
    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
def create_trigger(project, bucket, scan_period_days, info_types,
                   trigger_id=None, display_name=None, description=None,
                   min_likelihood=None, max_findings=None,
                   auto_populate_timespan=False):
    """Creates a scheduled Data Loss Prevention API inspect_content trigger.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bucket: The name of the GCS bucket to scan. This sample scans all
            files in the bucket using a wildcard.
        scan_period_days: How often to repeat the scan, in days.
            The minimum is 1 day.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        trigger_id: The id of the trigger. If omitted, an id will be randomly
            generated.
        display_name: The optional display name of the trigger.
        description: The optional description of the trigger.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        auto_populate_timespan: Automatically populates time span config start
            and end times in order to scan new content only.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct a cloud_storage_options dictionary with the bucket's URL.
    url = 'gs://{}/*'.format(bucket)
    storage_config = {
        'cloud_storage_options': {
            'file_set': {'url': url}
        },
        # Time-based configuration for each storage object.
        'timespan_config': {
            # Auto-populate start and end times in order to scan new objects
            # only.
            'enable_auto_population_of_timespan_config': auto_populate_timespan
        },
    }

    # Construct the job definition.
    job = {
        'inspect_config': inspect_config,
        'storage_config': storage_config,
    }

    # Construct the schedule definition:
    schedule = {
        'recurrence_period_duration': {
            'seconds': scan_period_days * 60 * 60 * 24,
        }
    }

    # Construct the trigger definition.
    job_trigger = {
        'inspect_job': job,
        'display_name': display_name,
        'description': description,
        'triggers': [
            {'schedule': schedule}
        ],
        'status': 'HEALTHY'
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.create_job_trigger(
        parent, job_trigger=job_trigger, trigger_id=trigger_id)

    print('Successfully created trigger {}'.format(response.name))
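
As an illustrative sketch (the bucket name and ids are placeholders), the
following would create a trigger that rescans a bucket daily, inspecting new
content only:

    create_trigger('my-project', 'my-bucket', scan_period_days=1,
                   info_types=['EMAIL_ADDRESS', 'PHONE_NUMBER'],
                   auto_populate_timespan=True)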
def categorical_risk_analysis(project, table_project_id, dataset_id, table_id,
                              column_name, topic_id, subscription_id,
                              timeout=300):
    """Uses the Data Loss Prevention API to compute risk metrics of a column
       of categorical data in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        column_name: The name of the column to compute risk metrics for.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # The subscription future raises concurrent.futures.TimeoutError, which
    # is distinct from the builtin TimeoutError on older Python versions.
    import concurrent.futures

    def callback(message):
        if message.attributes['DlpJobName'] == operation.name:
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(operation.name)
            histogram_buckets = (job.risk_details
                                    .categorical_stats_result
                                    .value_frequency_histogram_buckets)
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print('Bucket {}:'.format(i))
                print('   Most common value occurs {} time(s)'.format(
                    bucket.value_frequency_upper_bound))
                print('   Least common value occurs {} time(s)'.format(
                    bucket.value_frequency_lower_bound))
                print('   {} unique values total.'.format(
                    bucket.bucket_size))
                for value in bucket.bucket_values:
                    print('   Value {} occurs {} time(s)'.format(
                        value.value.integer_value, value.count))
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the name of the categorical column to compute risk metrics for
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {
                'field': {
                    'name': column_name
                }
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    try:
        subscription.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
        subscription.close()
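
A hypothetical invocation, mirroring the numerical sample above (all ids are
placeholders):

    categorical_risk_analysis(
        'my-project', 'bq-project', 'my_dataset', 'my_table',
        column_name='city', topic_id='dlp-topic',
        subscription_id='dlp-subscription')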
Example 47
def inspect_string(
    project,
    content_string,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    include_quote=True,
):
    """Uses the Data Loss Prevention API to analyze strings for protected data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct the `item`.
    item = {"value": content_string}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                if finding.quote:
                    print("Quote: {}".format(finding.quote))
            except AttributeError:
                pass
            print("Info type: {}".format(finding.info_type.name))
            print("Likelihood: {}".format(finding.likelihood))
    else:
        print("No findings.")
def k_anonymity_analysis(project, table_project_id, dataset_id, table_id,
                         topic_id, subscription_id, quasi_ids, timeout=300):
    """Uses the Data Loss Prevention API to compute the k-anonymity of a
        column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        quasi_ids: A set of columns that form a composite key.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # The subscription future raises concurrent.futures.TimeoutError, which
    # is distinct from the builtin TimeoutError on older Python versions.
    import concurrent.futures

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        if message.attributes['DlpJobName'] == operation.name:
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(operation.name)
            histogram_buckets = (job.risk_details
                                    .k_anonymity_result
                                    .equivalence_class_histogram_buckets)
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print('Bucket {}:'.format(i))
                if bucket.equivalence_class_size_lower_bound:
                    print('   Bucket size range: [{}, {}]'.format(
                        bucket.equivalence_class_size_lower_bound,
                        bucket.equivalence_class_size_upper_bound))
                    for value_bucket in bucket.bucket_values:
                        print('   Quasi-ID values: {}'.format(
                            [get_values(v)
                             for v in value_bucket.quasi_ids_values]))
                        print('   Class size: {}'.format(
                            value_bucket.equivalence_class_size))
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Convert quasi id list to Protobuf type. Materialize the result as a
    # list; in Python 3, map() returns a lazy iterator, which the protobuf
    # conversion does not accept.
    def map_fields(field):
        return {'name': field}

    quasi_ids = [map_fields(field) for field in quasi_ids]

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the quasi-identifier columns to compute k-anonymity over
    risk_job = {
        'privacy_metric': {
            'k_anonymity_config': {
                'quasi_ids': quasi_ids
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    try:
        subscription.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
        subscription.close()
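
A hypothetical call (all ids and column names are placeholders):

    k_anonymity_analysis(
        'my-project', 'bq-project', 'my_dataset', 'my_table',
        topic_id='dlp-topic', subscription_id='dlp-subscription',
        quasi_ids=['age', 'zip_code'])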
Example 49
def inspect_bigquery(
    project,
    bigquery_project,
    dataset_id,
    table_id,
    topic_id,
    subscription_id,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    timeout=300,
):
    """Uses the Data Loss Prevention API to analyze BigQuery data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bigquery_project: The Google Cloud project id of the target table.
        dataset_id: The id of the target BigQuery dataset.
        table_id: The id of the target BigQuery table.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for a response from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct a storage_config containing the target BigQuery info.
    storage_config = {
        "big_query_options": {
            "table_reference": {
                "project_id": bigquery_project,
                "dataset_id": dataset_id,
                "table_id": table_id,
            }
        }
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        "inspect_config": inspect_config,
        "storage_config": storage_config,
        "actions": actions,
    }

    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if message.attributes["DlpJobName"] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                if job.inspect_details.result.info_type_stats:
                    for finding in job.inspect_details.result.info_type_stats:
                        print(
                            "Info type: {}; Count: {}".format(
                                finding.info_type.name, finding.count
                            )
                        )
                else:
                    print("No findings.")

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )
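
As the docstring above notes, the Pub/Sub topic and subscription must already exist. A minimal sketch of creating them with google-cloud-pubsub (assuming the pre-2.0 client API used throughout these samples; project and resource names are illustrative placeholders):

from google.cloud import pubsub

publisher = pubsub.PublisherClient()
subscriber = pubsub.SubscriberClient()

# Build the fully qualified resource names.
topic_path = publisher.topic_path("my-project", "dlp-notifications")
subscription_path = subscriber.subscription_path(
    "my-project", "dlp-notifications-sub")

# Create the topic, then a subscription attached to it.
publisher.create_topic(topic_path)
subscriber.create_subscription(subscription_path, topic_path)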
Example n. 50
def k_map_estimate_analysis(project, table_project_id, dataset_id, table_id,
                            topic_id, subscription_id, quasi_ids, info_types,
                            region_code='US', timeout=300):
    """Uses the Data Loss Prevention API to compute the k-map risk estimation
        of a column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        quasi_ids: A set of columns that form a composite key and optionally
            their reidentification distributions.
        info_types: A list of strings representing the infoType of each
            quasi-identifier, used to model the underlying population.
        region_code: The ISO 3166-1 region code that the data is representative
            of. Can be omitted if using a region-specific infoType (such as
            US_ZIP_5).
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Check that the number of quasi-ids equals the number of info types.
    if len(quasi_ids) != len(info_types):
        raise ValueError(
            'Number of infoTypes and number of quasi-identifiers must be '
            'equal!')

    # Convert quasi id list to Protobuf type
    def map_fields(quasi_id, info_type):
        return {'field': {'name': quasi_id}, 'info_type': {'name': info_type}}

    quasi_ids = list(map(map_fields, quasi_ids, info_types))

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure the risk analysis job.
    # Pass the quasi-identifier/infoType pairs to the k-map estimation config.
    risk_job = {
        'privacy_metric': {
            'k_map_estimation_config': {
                'quasi_ids': quasi_ids,
                'region_code': region_code
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    # Create a helper function for unpacking values; this sample assumes the
    # quasi-identifier values are integer-valued.
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        try:
            if message.attributes['DlpJobName'] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                histogram_buckets = (job.risk_details
                                        .k_map_estimation_result
                                        .k_map_estimation_histogram)
                # Print bucket stats
                for i, bucket in enumerate(histogram_buckets):
                    print('Bucket {}:'.format(i))
                    print('   Anonymity range: [{}, {}]'.format(
                        bucket.min_anonymity, bucket.max_anonymity))
                    print('   Size: {}'.format(bucket.bucket_size))
                    for value_bucket in bucket.bucket_values:
                        print('   Values: {}'.format(
                            list(map(get_values,
                                     value_bucket.quasi_ids_values))))
                        print('   Estimated k-map anonymity: {}'.format(
                            value_bucket.estimated_anonymity))
                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
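
A hedged usage sketch for the example above (all ids and column names are illustrative placeholders, and the listed infoTypes assume US-representative data):

k_map_estimate_analysis(
    project='my-project',
    table_project_id='my-table-project',
    dataset_id='my_dataset',
    table_id='my_table',
    topic_id='dlp-notifications',
    subscription_id='dlp-notifications-sub',
    quasi_ids=['zip_code', 'age'],
    info_types=['US_ZIP_5', 'AGE'],
    region_code='US')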