Example #1
def create_DLP_job(data, done):
    """This function is triggered by new files uploaded to the designated Cloud Storage quarantine/staging bucket.

         It creates a dlp job for the uploaded file.
      Arg:
         data: The Cloud Storage Event
      Returns:
          None. Debug information is printed to the log.
      """
    # Get the targeted file in the quarantine bucket
    file_name = data['name']
    log('Function triggered for file [{}] to start a DLP job of InfoTypes [{}]'
        .format(file_name, ','.join(INFO_TYPES)),
        severity=LOG_SEVERITY_INFO)

    # Prepare info_types by converting the list of strings (INFO_TYPES) into a list of dictionaries
    info_types = [{'name': info_type} for info_type in INFO_TYPES]

    # Convert the project id into a full resource id.
    parent = f"projects/{PROJECT_ID}"

    # Construct the configuration dictionary.
    inspect_job = {
        'inspect_config': {
            'info_types': info_types,
            'min_likelihood': MIN_LIKELIHOOD,
            'limits': {
                'max_findings_per_request': MAX_FINDINGS
            },
        },
        'storage_config': {
            'cloud_storage_options': {
                'file_set': {
                    'url':
                    'gs://{bucket_name}/{file_name}'.format(
                        bucket_name=STAGING_BUCKET, file_name=file_name)
                }
            }
        },
        'actions': [{
            'pub_sub': {
                'topic':
                'projects/{project_id}/topics/{topic_id}'.format(
                    project_id=PROJECT_ID, topic_id=PUB_SUB_TOPIC)
            }
        }]
    }

    # Create the DLP job and let the DLP API process it.
    try:
        dlp.create_dlp_job(parent=parent, inspect_job=inspect_job)
        log('Job created by create_DLP_job', severity=LOG_SEVERITY_INFO)
    except Exception as e:
        log(e, severity=LOG_SEVERITY_ERROR)
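
The function depends on module-level globals (dlp, PROJECT_ID, STAGING_BUCKET, PUB_SUB_TOPIC, INFO_TYPES, MIN_LIKELIHOOD, MAX_FINDINGS, and a log helper) defined elsewhere in the Cloud Function source. A minimal local sketch of that setup and a simulated trigger, using placeholder values only:

import google.cloud.dlp_v2

# Placeholder configuration; the real deployment defines these elsewhere.
PROJECT_ID = 'my-project'
STAGING_BUCKET = 'my-quarantine-bucket'
PUB_SUB_TOPIC = 'dlp-job-notifications'
INFO_TYPES = ['EMAIL_ADDRESS', 'PHONE_NUMBER']
MIN_LIKELIHOOD = 'POSSIBLE'
MAX_FINDINGS = 0
LOG_SEVERITY_INFO = 'INFO'
LOG_SEVERITY_ERROR = 'ERROR'

dlp = google.cloud.dlp_v2.DlpServiceClient()


def log(message, severity=LOG_SEVERITY_INFO):
    # Stand-in for the structured logging helper used by the deployed function.
    print('[{}] {}'.format(severity, message))


# Simulate the Cloud Storage event that would normally trigger the function.
create_DLP_job({'name': 'uploads/customers.csv'}, None)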
Example #2
def test_job_name():
    import google.cloud.dlp_v2

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        "privacy_metric": {
            "categorical_stats_config": {
                "field": {
                    "name": TEST_COLUMN_NAME
                }
            }
        },
        "source_table": {
            "project_id": TEST_TABLE_PROJECT_ID,
            "dataset_id": TEST_DATASET_ID,
            "table_id": TEST_TABLE_ID,
        },
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind("/") + 1:]
    return job_name
Example #3
def test_job_name():
    import google.cloud.dlp
    dlp = google.cloud.dlp.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {
                'field': {
                    'name': TEST_COLUMN_NAME
                }
            }
        },
        'source_table': {
            'project_id': TEST_TABLE_PROJECT_ID,
            'dataset_id': TEST_DATASET_ID,
            'table_id': TEST_TABLE_ID
        }
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind('/')+1:]
    return job_name
Example #4
def create_test_job():
    import google.cloud.dlp
    dlp = google.cloud.dlp.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {
                'field': {
                    'name': TEST_COLUMN_NAME
                }
            }
        },
        'source_table': {
            'project_id': TEST_TABLE_PROJECT_ID,
            'dataset_id': TEST_DATASET_ID,
            'table_id': TEST_TABLE_ID
        }
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind('/')+1:]
    return job_name
Example #5
def test_job_name():
    import google.api_core.exceptions
    import google.cloud.dlp_v2

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        "privacy_metric": {
            "categorical_stats_config": {
                "field": {
                    "name": TEST_COLUMN_NAME
                }
            }
        },
        "source_table": {
            "project_id": TEST_TABLE_PROJECT_ID,
            "dataset_id": TEST_DATASET_ID,
            "table_id": TEST_TABLE_ID,
        },
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind("/") + 1:]
    yield job_name

    # clean up job if not deleted
    try:
        dlp.delete_dlp_job(full_path)
    except google.api_core.exceptions.NotFound:
        print("Issue during teardown, missing job")
Example #6
def l_diversity_analysis(project, table_project_id, dataset_id, table_id,
                         topic_id, subscription_id, sensitive_attribute,
                         quasi_ids, timeout=300):
    """Uses the Data Loss Prevention API to compute the l-diversity of a
        column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        sensitive_attribute: The column to measure l-diversity relative to.
        quasi_ids: A set of columns that form a composite key.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Convert quasi id list to Protobuf type
    def map_fields(field):
        return {'name': field}

    quasi_ids = map(map_fields, quasi_ids)

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the name of the numeric column to compute risk metrics for
    risk_job = {
        'privacy_metric': {
            'l_diversity_config': {
                'quasi_ids': quasi_ids,
                'sensitive_attribute': {
                    'name': sensitive_attribute
                }
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)
    subscription = subscriber.subscribe(subscription_path)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        try:
            if (message.attributes['DlpJobName'] == operation.name):
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                histogram_buckets = (
                    job.risk_details
                       .l_diversity_result
                       .sensitive_value_frequency_histogram_buckets)
                # Print bucket stats
                for i, bucket in enumerate(histogram_buckets):
                    print('Bucket {}:'.format(i))
                    print('   Bucket size range: [{}, {}]'.format(
                        bucket.sensitive_value_frequency_lower_bound,
                        bucket.sensitive_value_frequency_upper_bound))
                    for value_bucket in bucket.bucket_values:
                        print('   Quasi-ID values: {}'.format(
                            list(map(get_values, value_bucket.quasi_ids_values))))
                        print('   Class size: {}'.format(
                            value_bucket.equivalence_class_size))
                        for value in value_bucket.top_sensitive_values:
                            print(('   Sensitive value {} occurs {} time(s)'
                                   .format(value.value, value.count)))
                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscription.open(callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
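
A minimal invocation sketch with placeholder project, table, and Pub/Sub identifiers (the topic and subscription are assumed to already exist and be attached to each other):

l_diversity_analysis(
    project='my-project',
    table_project_id='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    topic_id='dlp-risk-topic',
    subscription_id='dlp-risk-sub',
    sensitive_attribute='diagnosis',
    quasi_ids=['age', 'zip_code'],
)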
Example #7
def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
                     topic_id, subscription_id, info_types,
                     custom_dictionaries=None, custom_regexes=None,
                     min_likelihood=None, max_findings=None, timeout=300):
    """Uses the Data Loss Prevention API to analyze BigQuery data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bigquery_project: The Google Cloud project id of the target table.
        dataset_id: The id of the target BigQuery dataset.
        table_id: The id of the target BigQuery table.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: A list of comma-separated word lists to use as
            custom dictionary detectors, if any.
        custom_regexes: A list of regex patterns to use as custom regex
            detectors, if any.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for a response from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct a storage_config containing the target Bigquery info.
    storage_config = {
        'big_query_options': {
            'table_reference': {
                'project_id': bigquery_project,
                'dataset_id': dataset_id,
                'table_id': table_id,
            }
        }
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        'inspect_config': inspect_config,
        'storage_config': storage_config,
        'actions': actions,
    }

    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if (message.attributes['DlpJobName'] == operation.name):
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                if job.inspect_details.result.info_type_stats:
                    for finding in job.inspect_details.result.info_type_stats:
                        print('Info type: {}; Count: {}'.format(
                            finding.info_type.name, finding.count))
                else:
                    print('No findings.')

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
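
A hedged invocation sketch; all identifiers are placeholders, and the custom dictionary and regex values only illustrate the expected string formats:

inspect_bigquery(
    project='my-project',
    bigquery_project='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    topic_id='dlp-inspect-topic',
    subscription_id='dlp-inspect-sub',
    info_types=['EMAIL_ADDRESS', 'PHONE_NUMBER'],
    custom_dictionaries=['alpha,beta,gamma'],
    custom_regexes=[r'[A-Z]{3}-[0-9]{6}'],
    min_likelihood='POSSIBLE',
    max_findings=100,
)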
Example #8
def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
                     info_types, cscc=False, output_project=None, output_dataset_id=None, output_table_id=None,
                     topic_id=None, subscription_id=None,
                     custom_dictionaries=None, custom_regexes=None,
                     min_likelihood=None, max_findings=None, timeout=300):
    """Uses the Data Loss Prevention API to analyze BigQuery data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bigquery_project: The Google Cloud project id of the target table.
        dataset_id: The id of the target BigQuery dataset.
        table_id: The id of the target BigQuery table.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        cscc: Whether the job should publish a summary of findings to Cloud
            Security Command Center. Defaults to False.
        output_project: The Google Cloud project id of the output table.
        output_dataset_id: The id of the output BigQuery dataset.
        output_table_id: The id of the output BigQuery table.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        custom_dictionaries: A list of comma-separated word lists to use as
            custom dictionary detectors, if any.
        custom_regexes: A list of regex patterns to use as custom regex
            detectors, if any.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for a response from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = [{'name': 'ALL_BASIC'}]
    else:
        info_types = [{'name': info_type} for info_type in info_types]


    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct a storage_config containing the target Bigquery info.
    storage_config = {
        'big_query_options': {
            'table_reference': {
                'project_id': bigquery_project,
                'dataset_id': dataset_id,
                'table_id': table_id,
            }
        }
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Tell the API where to send findings to.
    actions = []

    if cscc:
        actions.append({
            'publish_summary_to_cscc': {}
        })

    if output_project and output_dataset_id and output_table_id:
        actions.append({
            'save_findings': {
                'output_config': {
                    'table': {
                        'project_id': output_project,
                        'dataset_id': output_dataset_id,
                        'table_id': output_table_id
                    }
                }
            }
        })

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        'inspect_config': inspect_config,
        'storage_config': storage_config,
        'actions': actions
    }

    dlp.create_dlp_job(parent, inspect_job=inspect_job)

    # Note: unlike the other samples, this variant never registers a Pub/Sub
    # subscriber, so the event below is never set and the wait acts only as a
    # fixed delay; findings are delivered through the configured actions
    # (Cloud SCC and/or the output BigQuery table) rather than printed here.
    job_done = threading.Event()
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
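
This variant reports through its actions rather than through Pub/Sub, so a typical call enables Cloud SCC publishing and/or an output table. Placeholder identifiers only:

inspect_bigquery(
    project='my-project',
    bigquery_project='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    info_types=['EMAIL_ADDRESS'],
    cscc=True,
    output_project='my-project',
    output_dataset_id='dlp_results',
    output_table_id='findings',
)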
Example #9
def k_map_estimate_analysis(
    project,
    table_project_id,
    dataset_id,
    table_id,
    topic_id,
    subscription_id,
    quasi_ids,
    info_types,
    region_code="US",
    timeout=300,
):
    """Uses the Data Loss Prevention API to compute the k-map risk estimation
        of a column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        quasi_ids: A set of columns that form a composite key and optionally
            their reidentification distributions.
        info_types: Type of information of the quasi_id in order to provide a
            statistical model of population.
        region_code: The ISO 3166-1 region code that the data is representative
            of. Can be omitted if using a region-specific infoType (such as
            US_ZIP_5)
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp_v2

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        if message.attributes["DlpJobName"] == operation.name:
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(operation.name)
            histogram_buckets = (
                job.risk_details.k_map_estimation_result.k_map_estimation_histogram
            )
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print("Bucket {}:".format(i))
                print(
                    "   Anonymity range: [{}, {}]".format(
                        bucket.min_anonymity, bucket.max_anonymity
                    )
                )
                print("   Size: {}".format(bucket.bucket_size))
                for value_bucket in bucket.bucket_values:
                    print(
                        "   Values: {}".format(
                            list(map(get_values, value_bucket.quasi_ids_values))
                        )
                    )
                    print(
                        "   Estimated k-map anonymity: {}".format(
                            value_bucket.estimated_anonymity
                        )
                    )
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        "project_id": table_project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
    }

    # Check that numbers of quasi-ids and info types are equal
    if len(quasi_ids) != len(info_types):
        raise ValueError(
            """Number of infoTypes and number of quasi-identifiers
                            must be equal!"""
        )

    # Convert quasi id list to Protobuf type
    def map_fields(quasi_id, info_type):
        return {"field": {"name": quasi_id}, "info_type": {"name": info_type}}

    quasi_ids = map(map_fields, quasi_ids, info_types)

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}]

    # Configure risk analysis job
    # Give the name of the numeric column to compute risk metrics for
    risk_job = {
        "privacy_metric": {
            "k_map_estimation_config": {
                "quasi_ids": quasi_ids,
                "region_code": region_code,
            }
        },
        "source_table": source_table,
        "actions": actions,
    }

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    try:
        subscription.result(timeout=timeout)
    except TimeoutError:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )
        subscription.close()
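
A hedged invocation sketch; quasi_ids and info_types must be the same length, pairing each quasi-identifier column with the infoType that describes it. All identifiers are placeholders:

k_map_estimate_analysis(
    project='my-project',
    table_project_id='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    topic_id='dlp-risk-topic',
    subscription_id='dlp-risk-sub',
    quasi_ids=['age', 'zip_code'],
    info_types=['AGE', 'US_ZIP_5'],
    region_code='US',
)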
Example #10
def l_diversity_analysis(
    project,
    table_project_id,
    dataset_id,
    table_id,
    topic_id,
    subscription_id,
    sensitive_attribute,
    quasi_ids,
    timeout=300,
):
    """Uses the Data Loss Prevention API to compute the l-diversity of a
        column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        sensitive_attribute: The column to measure l-diversity relative to.
        quasi_ids: A set of columns that form a composite key.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp_v2

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        if message.attributes["DlpJobName"] == operation.name:
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(operation.name)
            histogram_buckets = (
                job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets
            )
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print("Bucket {}:".format(i))
                print(
                    "   Bucket size range: [{}, {}]".format(
                        bucket.sensitive_value_frequency_lower_bound,
                        bucket.sensitive_value_frequency_upper_bound,
                    )
                )
                for value_bucket in bucket.bucket_values:
                    print(
                        "   Quasi-ID values: {}".format(
                            list(map(get_values, value_bucket.quasi_ids_values))
                        )
                    )
                    print(
                        "   Class size: {}".format(value_bucket.equivalence_class_size)
                    )
                    for value in value_bucket.top_sensitive_values:
                        print(
                            (
                                "   Sensitive value {} occurs {} time(s)".format(
                                    value.value, value.count
                                )
                            )
                        )
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        "project_id": table_project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
    }

    # Convert quasi id list to Protobuf type
    def map_fields(field):
        return {"name": field}

    quasi_ids = map(map_fields, quasi_ids)

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}]

    # Configure risk analysis job
    # Give the name of the numeric column to compute risk metrics for
    risk_job = {
        "privacy_metric": {
            "l_diversity_config": {
                "quasi_ids": quasi_ids,
                "sensitive_attribute": {"name": sensitive_attribute},
            }
        },
        "source_table": source_table,
        "actions": actions,
    }

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    try:
        subscription.result(timeout=timeout)
    except TimeoutError:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )
        subscription.close()
Example #11
def numerical_risk_analysis(
    project,
    table_project_id,
    dataset_id,
    table_id,
    column_name,
    topic_id,
    subscription_id,
    timeout=300,
):
    """Uses the Data Loss Prevention API to compute risk metrics of a column
       of numerical data in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        column_name: The name of the column to compute risk metrics for.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp_v2

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    def callback(message):
        if message.attributes["DlpJobName"] == operation.name:
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(operation.name)
            results = job.risk_details.numerical_stats_result
            print(
                "Value Range: [{}, {}]".format(
                    results.min_value.integer_value, results.max_value.integer_value
                )
            )
            prev_value = None
            for percent, result in enumerate(results.quantile_values):
                value = result.integer_value
                if prev_value != value:
                    print("Value at {}% quantile: {}".format(percent, value))
                    prev_value = value
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        "project_id": table_project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
    }

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}]

    # Configure risk analysis job
    # Give the name of the numeric column to compute risk metrics for
    risk_job = {
        "privacy_metric": {"numerical_stats_config": {"field": {"name": column_name}}},
        "source_table": source_table,
        "actions": actions,
    }

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    try:
        subscription.result(timeout=timeout)
    except TimeoutError:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )
        subscription.close()
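
A minimal invocation sketch with placeholder identifiers; the named column should hold numeric data for the quantile output to be meaningful:

numerical_risk_analysis(
    project='my-project',
    table_project_id='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    column_name='salary',
    topic_id='dlp-risk-topic',
    subscription_id='dlp-risk-sub',
)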
Example #12
def categorical_risk_analysis(
    project,
    table_project_id,
    dataset_id,
    table_id,
    column_name,
    topic_id,
    subscription_id,
    timeout=300,
):
    """Uses the Data Loss Prevention API to compute risk metrics of a column
       of categorical data in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        column_name: The name of the column to compute risk metrics for.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp_v2

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into full resource ids.
    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
    parent = f"projects/{project}/locations/global"

    # Location info of the BigQuery table.
    source_table = {
        "project_id": table_project_id,
        "dataset_id": dataset_id,
        "table_id": table_id,
    }

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": topic}}]

    # Configure risk analysis job
    # Give the name of the numeric column to compute risk metrics for
    risk_job = {
        "privacy_metric": {
            "categorical_stats_config": {
                "field": {
                    "name": column_name
                }
            }
        },
        "source_table": source_table,
        "actions": actions,
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(request={
        "parent": parent,
        "risk_job": risk_job
    })

    def callback(message):
        if message.attributes["DlpJobName"] == operation.name:
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(request={"name": operation.name})
            histogram_buckets = (
                job.risk_details.categorical_stats_result.
                value_frequency_histogram_buckets  # noqa: E501
            )
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print("Bucket {}:".format(i))
                print("   Most common value occurs {} time(s)".format(
                    bucket.value_frequency_upper_bound))
                print("   Least common value occurs {} time(s)".format(
                    bucket.value_frequency_lower_bound))
                print("   {} unique values total.".format(bucket.bucket_size))
                for value in bucket.bucket_values:
                    print("   Value {} occurs {} time(s)".format(
                        value.value.integer_value, value.count))
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    try:
        subscription.result(timeout=timeout)
    except TimeoutError:
        print("No event received before the timeout. Please verify that the "
              "subscription provided is subscribed to the topic provided.")
        subscription.close()
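
A minimal invocation sketch with placeholder identifiers; this variant is written against the newer request-style client surface, so it assumes a correspondingly recent google-cloud-dlp and google-cloud-pubsub:

categorical_risk_analysis(
    project='my-project',
    table_project_id='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    column_name='city',
    topic_id='dlp-risk-topic',
    subscription_id='dlp-risk-sub',
)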
Example #13
def dlp(request):
  from google.cloud import bigquery
  import os
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\gcp_credentials\elaborate-howl-285701-105c2e8355a8.json"
  client_bigquery = bigquery.Client()  # BigQuery client
  import uuid
  import google.cloud.dlp_v2
  import time
  
  
  uuid = str(uuid.uuid4())
  print(uuid)
  request_json = request.get_json()  # JSON message received from the HTTP request

  if request_json:
    file_name=request_json["file_name"]
    print(file_name)
    #query of creating table start
    
  query="""

      create table `elaborate-howl-285701.context.{uuid}_dlp` as SELECT * FROM `elaborate-howl-285701.context.form_key_pair` 
  where file_name=\"{file_name}\";
  """.format(uuid=uuid,file_name=file_name)
  #query of creating table end
  job_config = bigquery.QueryJobConfig()
  query_job = client_bigquery.query(query, location="US", job_config=job_config)
  query_job.result()
  #dlp work start

  project='elaborate-howl-285701'
  bigquery_project='elaborate-howl-285701'
  dataset_id='context'
  table_id=uuid+'_dlp'
  min_likelihood = None
  max_findings = None
  parent = f"projects/{project}/locations/global"

  inspect_job_data = {
      'storage_config': {
          
          'big_query_options': {
              'table_reference': {
                  
                  'project_id': bigquery_project,
                  'dataset_id': dataset_id,
                  'table_id': table_id
                  
              },
              'identifying_fields':[
                  {
                    'name':'file_name',
                  }
              ],
              'excluded_fields': [
                  # One FieldId dict per column to exclude from inspection.
                  {'name': 'field_name'},
                  {'name': 'time_stamp'},
                  {'name': 'validated_field_name'},
                  {'name': 'validated_field_value'},
                  {'name': 'updated_date'},
                  {'name': 'confidence'},
                  {'name': 'updated_by'},
                  {'name': 'key_x1'},
                  {'name': 'key_x2'},
                  {'name': 'key_y1'},
                  {'name': 'key_y2'},
                  {'name': 'value_x1'},
                  {'name': 'value_x2'},
                  {'name': 'value_y1'},
                  {'name': 'value_y2'},
                  {'name': 'pageNumber'},
                  {'name': 'id'},
                  {'name': 'type'}
              ],
              
              'rows_limit':10000,
              'sample_method':'TOP',
          },
      },
      'inspect_config': {
          'info_types': [{'name': 'FIRST_NAME'}, {'name': 'LAST_NAME'}, {'name': 'EMAIL_ADDRESS'},{'name': 'AGE'}, {'name': 'CREDIT_CARD_NUMBER'}, {'name': 'DATE'},{'name': 'DATE_OF_BIRTH'}, {'name': 'DOMAIN_NAME'}, {'name': 'EMAIL_ADDRESS'},
           {'name': 'US_EMPLOYER_IDENTIFICATION_NUMBER'}, {'name': 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER'},{'name': 'US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER'}, {'name': 'US_SOCIAL_SECURITY_NUMBER'}, {'name': 'US_VEHICLE_IDENTIFICATION_NUMBER'},
           {'name': 'US_TOLLFREE_PHONE_NUMBER'}, {'name': 'US_STATE'}, {'name': 'US_PASSPORT'},{'name': 'US_HEALTHCARE_NPI'}, {'name': 'GENDER'}, {'name': 'LOCATION'}, {'name': 'PASSPORT'}, {'name': 'PASSWORD'},
            {'name': 'PHONE_NUMBER'}, {'name': 'STREET_ADDRESS'},{'name': 'URL'}, {'name': 'US_BANK_ROUTING_MICR'}, {'name': 'US_DEA_NUMBER'},{'name': 'US_DRIVERS_LICENSE_NUMBER'}],
          "include_quote": True,
          "min_likelihood": 2,
      },
      'actions': [
          {
              'save_findings': {
                  'output_config':{
                      'table':{
                          'project_id': bigquery_project,
                          'dataset_id': dataset_id,
                          'table_id': '{}_job'.format(table_id)
                      }
                  }
                  
              },
          },
      ]
  }
  dlp = google.cloud.dlp_v2.DlpServiceClient()
  operation = dlp.create_dlp_job(parent=parent, inspect_job=inspect_job_data)

  time.sleep(200)



  #dlp work end
  #query for dropping created table
  query2="""
  drop table  `elaborate-howl-285701.context.{table_id}`;
    
  """.format(table_id=table_id)
  #query of creating table end
  job_config = bigquery.QueryJobConfig()
  query_job2 = client_bigquery.query(query2, location="US", job_config=job_config)
  query_job2.result()

  #checking rows in form_key_pair table

  destination_table = client_bigquery.get_table('elaborate-howl-285701.context.form_key_pair_dlp')  # Make an API request.
  print("before insertion {} rows.".format(destination_table.num_rows))



  #copy data loss prevention on desired form_key_pair_dlp
  query3="""
  INSERT INTO `elaborate-howl-285701.context.form_key_pair_dlp`
  SELECT * FROM `elaborate-howl-285701.context.{tableid2}_job`
  """.format(tableid2=table_id)
  print(query3)
  #query of creating table end
  job_config = bigquery.QueryJobConfig()
  query_job3 = client_bigquery.query(query3, location="US", job_config=job_config)
  query_job3.result()
  #time.sleep(30)
  #checking rows in form_key_pair table

  destination_table = client_bigquery.get_table('elaborate-howl-285701.context.form_key_pair_dlp')  # Make an API request.
  print("after insertion {} rows.".format(destination_table.num_rows))

  job = dlp.get_dlp_job(request={"name": operation.name})
  result_count=""
  if job.inspect_details.result.info_type_stats:

    for finding in job.inspect_details.result.info_type_stats:

        result_="Info type: {}; Count: {}".format(finding.info_type.name, finding.count)
        result_count=result_+result_count+'\n'
        print(result_count)


  #query for dropping dlp table
  query4="""
  drop table  `elaborate-howl-285701.context.{table_id2}_job`;
  """.format(table_id2=table_id)
  #query of creating table end
  job_config = bigquery.QueryJobConfig()
  query_job4 = client_bigquery.query(query4, location="US", job_config=job_config)
  query_job4.result()

  ## work for neo4j starts

  query5 = """
    select distinct  a.field_value, a.field_name, b.info_type.name as info_, b.likelihood from `elaborate-howl-285701.context.form_key_pair` a,
    `elaborate-howl-285701.context.form_key_pair_dlp` b
    where a.file_name=\"{file_name}\"
    and lower(a.field_value)=lower(b.quote);
    """.format(file_name=file_name)
  query_job5 = client_bigquery.query(
      query5,
    # Location must match that of the dataset(s) referenced in the query.
      location="US",
  )  # API request - starts the query

  df = query_job5.to_dataframe()
  f_value=[]
  for a in df.field_value:
    f_value.append(a)

  f_name=[]
  for b in df.field_name:
    f_name.append(b)

  info_name=[]
  for c in df.info_:
    info_name.append(c)    
  from neo4j import GraphDatabase
  import logging
  from neo4j.exceptions import ServiceUnavailable
  class App:

    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        # Don't forget to close the driver connection when you are finished with it
        self.driver.close()
    def create_friendship(self,file_name,field_value,field_name,info_):
            with self.driver.session() as session:
                # Write transactions allow the driver to handle retries and transient errors
                result = session.write_transaction(
                    self._create_and_return_friendship, file_name, field_value,field_name,info_)
                print(result)
                #for row in result:
                #    print("Created relation between: {n}, {m} ".format(n=row['n'], m=row['m']))
                #    print("Created relation between: {n}, {e} ".format(n=row['n'], e=row['e']))
                #    print("Created relation between: {e}, {m} ".format(e=row['e'], m=row['m']))
                #   print("Created relation between: {m}, {w} ".format(m=row['m'], w=row['w']))
    @staticmethod
    def _create_and_return_friendship(tx, file_name, field_value,field_name,info_):
        # To learn more about the Cypher syntax, see https://neo4j.com/docs/cypher-manual/current/
        # The Reference Card is also a good resource for keywords https://neo4j.com/docs/cypher-refcard/current/
        query = """
        merge (n:File {Name: $file_name})
        merge (m:FIELD {Name: $field_name})
        merge (e:VALUE {value: $field_value})
        merge (w:DLP_Classification {NAME: $info_})

        merge (n)-[p:CONTAINS_FIELD]->(m)
        merge (n)-[q:CONTAINS_VALUE]->(e)
        merge (e)-[r:TYPE_IS]->(m)
        merge (m)-[s:DATA_Classification]->(w)


        
        RETURN n, m, e, w, p, q, r, s
        """
        result = tx.run(query, file_name=file_name, field_value=field_value,field_name=field_name,info_=info_)
        try:
            return [{"n": row["n"]["Name"], "e": row["e"]["value"]}
                    for row in result]
        # Capture any errors along with the query and data for traceability
        except ServiceUnavailable as exception:
            logging.error("{query} raised an error: \n {exception}".format(
                query=query, exception=exception))
            raise
  import itertools
  for (a,b,c) in zip(f_value,f_name,info_name):
    print(a+','+b+','+c)
    bolt_url = "neo4j+s://cfb079ca.databases.neo4j.io"
    user = "******"
    password = "******"
    app = App(bolt_url, user, password)
    app.create_friendship(file_name, a,b,c)
    app.close()


  

  return "df"
Example #14
def create_DLP_job(data, done):
    """This function is triggered by new files uploaded to the designated Cloud Storage quarantine/staging bucket.

       It creates a dlp job for the uploaded file.
    Arg:
       data: The Cloud Storage Event
    Returns:
        None. Debug information is printed to the log.
    """
    # Get the targeted file in the quarantine bucket
    file_name = data['name']
    print('Function triggered for file [{}]'.format(file_name))

    # Prepare info_types by converting the list of strings (INFO_TYPES) into a list of dictionaries
    info_types = [{'name': info_type} for info_type in INFO_TYPES]

    # Convert the project id into a full resource id.
    parent = dlp.project_path(PROJECT_ID)

    # Construct the configuration dictionary.
    inspect_job = {
        'inspect_config': {
            'info_types': info_types,
            'min_likelihood': MIN_LIKELIHOOD,
            'limits': {
                'max_findings_per_request': MAX_FINDINGS
            },
        },
        'storage_config': {
            'cloud_storage_options': {
                'file_set': {
                    'url':
                    'gs://{bucket_name}/{file_name}'.format(
                        bucket_name=STAGING_BUCKET, file_name=file_name)
                }
            }
        },
        'actions': [{
            'pub_sub': {
                'topic':
                'projects/{project_id}/topics/{topic_id}'.format(
                    project_id=PROJECT_ID, topic_id=PUB_SUB_TOPIC)
            }
        }, {
            'save_findings': {
                'output_config': {
                    'table': {
                        'project_id': PROJECT_ID,
                        'dataset_id': DATASET_ID,
                        'table_id': TABLE_ID
                    }
                }
            }
        }, {
            'publish_summary_to_cscc': {}
        }]
    }

    # Create the DLP job and let the DLP API process it.
    try:
        dlp.create_dlp_job(parent, inspect_job=inspect_job)
        print('Job created by create_DLP_job')
    except Exception as e:
        print(e)
Example #15
def numerical_risk_analysis(project, table_project_id, dataset_id, table_id,
                            column_name, topic_id, subscription_id,
                            timeout=300):
    """Uses the Data Loss Prevention API to compute risk metrics of a column
       of numerical data in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        column_name: The name of the column to compute risk metrics for.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the name of the numeric column to compute risk metrics for
    risk_job = {
        'privacy_metric': {
            'numerical_stats_config': {
                'field': {
                    'name': column_name
                }
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)
    subscription = subscriber.subscribe(subscription_path)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if (message.attributes['DlpJobName'] == operation.name):
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                results = job.risk_details.numerical_stats_result
                print('Value Range: [{}, {}]'.format(
                    results.min_value.integer_value,
                    results.max_value.integer_value))
                prev_value = None
                for percent, result in enumerate(results.quantile_values):
                    value = result.integer_value
                    if prev_value != value:
                        print('Value at {}% quantile: {}'.format(
                              percent, value))
                        prev_value = value
                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscription.open(callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
Example #16
def inspect_gcs_file(project,
                     bucket,
                     filename,
                     topic_id,
                     subscription_id,
                     info_types,
                     custom_dictionaries=None,
                     custom_regexes=None,
                     min_likelihood=None,
                     max_findings=None,
                     timeout=300):
    """Uses the Data Loss Prevention API to analyze a file on GCS.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bucket: The name of the GCS bucket containing the file, as a string.
        filename: The name of the file in the bucket, including the path, as a
            string; e.g. 'images/myfile.png'.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: A list of comma-separated word lists; each list
            is used as a custom dictionary detector.
        custom_regexes: A list of regular expression strings; each pattern is
            used as a custom regex detector.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for a response from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {
            'name': 'CUSTOM_DICTIONARY_{}'.format(i)
        },
        'dictionary': {
            'word_list': {
                'words': custom_dict.split(',')
            }
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {
            'name': 'CUSTOM_REGEX_{}'.format(i)
        },
        'regex': {
            'pattern': custom_regex
        }
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    # Construct a storage_config containing the file's URL.
    url = 'gs://{}/{}'.format(bucket, filename)
    storage_config = {'cloud_storage_options': {'file_set': {'url': url}}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Tell the API where to send a notification when the job is complete.
    actions = [{'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}}]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        'inspect_config': inspect_config,
        'storage_config': storage_config,
        'actions': actions,
    }

    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if (message.attributes['DlpJobName'] == operation.name):
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                if job.inspect_details.result.info_type_stats:
                    for finding in job.inspect_details.result.info_type_stats:
                        print('Info type: {}; Count: {}'.format(
                            finding.info_type.name, finding.count))
                else:
                    print('No findings.')

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
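# Usage sketch (not part of the original sample): a hypothetical invocation of
# inspect_gcs_file(). Every resource name below is a placeholder; the bucket,
# topic and subscription must already exist in your project.
def run_inspect_gcs_file_demo():
    inspect_gcs_file(
        project='my-project',
        bucket='my-quarantine-bucket',
        filename='uploads/report.csv',
        topic_id='dlp-notifications',
        subscription_id='dlp-notifications-sub',
        info_types=['EMAIL_ADDRESS', 'PHONE_NUMBER'],
        custom_regexes=[r'\(\d{3}\) \d{3}-\d{4}'],
        min_likelihood='POSSIBLE',
        max_findings=100,
        timeout=600)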
def categorical_risk_analysis(project, table_project_id, dataset_id, table_id,
                              column_name, topic_id, subscription_id,
                              timeout=300):
    """Uses the Data Loss Prevention API to compute risk metrics of a column
       of categorical data in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        column_name: The name of the column to compute risk metrics for.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # The subscription future used below raises
    # concurrent.futures.TimeoutError when it times out.
    import concurrent.futures

    def callback(message):
        if (message.attributes['DlpJobName'] == operation.name):
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(operation.name)
            histogram_buckets = (job.risk_details
                                    .categorical_stats_result
                                    .value_frequency_histogram_buckets)
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print('Bucket {}:'.format(i))
                print('   Most common value occurs {} time(s)'.format(
                    bucket.value_frequency_upper_bound))
                print('   Least common value occurs {} time(s)'.format(
                    bucket.value_frequency_lower_bound))
                print('   {} unique values total.'.format(
                    bucket.bucket_size))
                for value in bucket.bucket_values:
                    print('   Value {} occurs {} time(s)'.format(
                        value.value.integer_value, value.count))
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the name of the categorical column to compute risk metrics for
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {
                'field': {
                    'name': column_name
                }
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    try:
        subscription.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
        subscription.close()
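# Usage sketch (not part of the original sample): a hypothetical invocation of
# categorical_risk_analysis(); all project, table, topic and subscription
# names below are placeholders.
def run_categorical_risk_analysis_demo():
    categorical_risk_analysis(
        project='my-project',
        table_project_id='my-project',
        dataset_id='my_dataset',
        table_id='my_table',
        column_name='city',
        topic_id='dlp-notifications',
        subscription_id='dlp-notifications-sub',
        timeout=600)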
def k_anonymity_analysis(project, table_project_id, dataset_id, table_id,
                         topic_id, subscription_id, quasi_ids, timeout=300):
    """Uses the Data Loss Prevention API to compute the k-anonymity of a
        column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        quasi_ids: A set of columns that form a composite key.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # The subscription future used below raises
    # concurrent.futures.TimeoutError when it times out.
    import concurrent.futures

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        if (message.attributes['DlpJobName'] == operation.name):
            # This is the message we're looking for, so acknowledge it.
            message.ack()

            # Now that the job is done, fetch the results and print them.
            job = dlp.get_dlp_job(operation.name)
            histogram_buckets = (job.risk_details
                                    .k_anonymity_result
                                    .equivalence_class_histogram_buckets)
            # Print bucket stats
            for i, bucket in enumerate(histogram_buckets):
                print('Bucket {}:'.format(i))
                if bucket.equivalence_class_size_lower_bound:
                    print('   Bucket size range: [{}, {}]'.format(
                        bucket.equivalence_class_size_lower_bound,
                        bucket.equivalence_class_size_upper_bound))
                    for value_bucket in bucket.bucket_values:
                        print('   Quasi-ID values: {}'.format(
                            list(map(get_values, value_bucket.quasi_ids_values))
                        ))
                        print('   Class size: {}'.format(
                            value_bucket.equivalence_class_size))
            subscription.set_result(None)
        else:
            # This is not the message we're looking for.
            message.drop()

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Convert quasi id list to Protobuf type
    def map_fields(field):
        return {'name': field}

    quasi_ids = list(map(map_fields, quasi_ids))

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the names of the quasi-identifier columns to compute k-anonymity for
    risk_job = {
        'privacy_metric': {
            'k_anonymity_config': {
                'quasi_ids': quasi_ids
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)
    subscription = subscriber.subscribe(subscription_path, callback)

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    try:
        subscription.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
        subscription.close()
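# Usage sketch (not part of the original sample): a hypothetical invocation of
# k_anonymity_analysis(). quasi_ids lists the column names that together form
# the composite key; all other names are placeholders.
def run_k_anonymity_analysis_demo():
    k_anonymity_analysis(
        project='my-project',
        table_project_id='my-project',
        dataset_id='my_dataset',
        table_id='my_table',
        topic_id='dlp-notifications',
        subscription_id='dlp-notifications-sub',
        quasi_ids=['zip_code', 'age_bracket'],
        timeout=600)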
Exemple #19
0
def inspect_bigquery(
    project,
    bigquery_project,
    dataset_id,
    table_id,
    topic_id,
    subscription_id,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    timeout=300,
):
    """Uses the Data Loss Prevention API to analyze BigQuery data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bigquery_project: The Google Cloud project id of the target table.
        dataset_id: The id of the target BigQuery dataset.
        table_id: The id of the target BigQuery table.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: A list of comma-separated word lists; each list
            is used as a custom dictionary detector.
        custom_regexes: A list of regular expression strings; each pattern is
            used as a custom regex detector.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for a response from the API.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct a storage_config containing the target Bigquery info.
    storage_config = {
        "big_query_options": {
            "table_reference": {
                "project_id": bigquery_project,
                "dataset_id": dataset_id,
                "table_id": table_id,
            }
        }
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        "inspect_config": inspect_config,
        "storage_config": storage_config,
        "actions": actions,
    }

    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if message.attributes["DlpJobName"] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                if job.inspect_details.result.info_type_stats:
                    for finding in job.inspect_details.result.info_type_stats:
                        print(
                            "Info type: {}; Count: {}".format(
                                finding.info_type.name, finding.count
                            )
                        )
                else:
                    print("No findings.")

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )
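# Usage sketch (not part of the original sample): a hypothetical invocation of
# inspect_bigquery(). A custom dictionary is passed as a single
# comma-separated string of words; every resource name below is a placeholder.
def run_inspect_bigquery_demo():
    inspect_bigquery(
        project="my-project",
        bigquery_project="my-project",
        dataset_id="my_dataset",
        table_id="my_table",
        topic_id="dlp-notifications",
        subscription_id="dlp-notifications-sub",
        info_types=["EMAIL_ADDRESS", "PHONE_NUMBER"],
        custom_dictionaries=["patient,diagnosis,insurance"],
        max_findings=100,
        timeout=600,
    )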
Exemple #20
0
def k_map_estimate_analysis(project, table_project_id, dataset_id, table_id,
                            topic_id, subscription_id, quasi_ids, info_types,
                            region_code='US', timeout=300):
    """Uses the Data Loss Prevention API to compute the k-map risk estimation
        of a column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        quasi_ids: A set of columns that form a composite key and optionally
            their reidentification distributions.
        info_types: Type of information of the quasi_id in order to provide a
            statistical model of population.
        region_code: The ISO 3166-1 region code that the data is representative
            of. Can be omitted if using a region-specific infoType (such as
            US_ZIP_5)
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Check that numbers of quasi-ids and info types are equal
    if len(quasi_ids) != len(info_types):
        raise ValueError("""Number of infoTypes and number of quasi-identifiers
                            must be equal!""")

    # Convert quasi id list to Protobuf type
    def map_fields(quasi_id, info_type):
        return {'field': {'name': quasi_id}, 'info_type': {'name': info_type}}

    quasi_ids = list(map(map_fields, quasi_ids, info_types))

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Give the quasi-identifier columns, their infoTypes and the region code
    risk_job = {
        'privacy_metric': {
            'k_map_estimation_config': {
                'quasi_ids': quasi_ids,
                'region_code': region_code
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project, subscription_id)
    subscription = subscriber.subscribe(subscription_path)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        try:
            if (message.attributes['DlpJobName'] == operation.name):
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                histogram_buckets = (job.risk_details
                                        .k_map_estimation_result
                                        .k_map_estimation_histogram)
                # Print bucket stats
                for i, bucket in enumerate(histogram_buckets):
                    print('Bucket {}:'.format(i))
                    print('   Anonymity range: [{}, {}]'.format(
                        bucket.min_anonymity, bucket.max_anonymity))
                    print('   Size: {}'.format(bucket.bucket_size))
                    for value_bucket in bucket.bucket_values:
                        print('   Values: {}'.format(
                            list(map(get_values, value_bucket.quasi_ids_values))
                        ))
                        print('   Estimated k-map anonymity: {}'.format(
                            value_bucket.estimated_anonymity))
                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscription.open(callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
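# Usage sketch (not part of the original sample): a hypothetical invocation of
# k_map_estimate_analysis(). quasi_ids and info_types are paired positionally
# and must be the same length; all resource names below are placeholders.
def run_k_map_estimate_analysis_demo():
    k_map_estimate_analysis(
        project='my-project',
        table_project_id='my-project',
        dataset_id='my_dataset',
        table_id='my_table',
        topic_id='dlp-notifications',
        subscription_id='dlp-notifications-sub',
        quasi_ids=['zip_code', 'age'],
        info_types=['US_ZIP_5', 'AGE'],
        region_code='US',
        timeout=600)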