def create_DLP_job(data, done):
    """This function is triggered by new files uploaded to the designated
    Cloud Storage quarantine/staging bucket. It creates a DLP job for the
    uploaded file.
    Args:
        data: The Cloud Storage event.
    Returns:
        None. Debug information is printed to the log.
    """
    # Get the targeted file in the quarantine bucket.
    file_name = data['name']
    log('Function triggered for file [{}] to start a DLP job of InfoTypes [{}]'
        .format(file_name, ','.join(INFO_TYPES)),
        severity=LOG_SEVERITY_INFO)

    # Prepare info_types by converting the list of strings (INFO_TYPES)
    # into a list of dictionaries.
    info_types = [{'name': info_type} for info_type in INFO_TYPES]

    # Convert the project id into a full resource id.
    parent = f"projects/{PROJECT_ID}"

    # Construct the configuration dictionary.
    inspect_job = {
        'inspect_config': {
            'info_types': info_types,
            'min_likelihood': MIN_LIKELIHOOD,
            'limits': {
                'max_findings_per_request': MAX_FINDINGS
            },
        },
        'storage_config': {
            'cloud_storage_options': {
                'file_set': {
                    'url': 'gs://{bucket_name}/{file_name}'.format(
                        bucket_name=STAGING_BUCKET, file_name=file_name)
                }
            }
        },
        'actions': [{
            'pub_sub': {
                'topic': 'projects/{project_id}/topics/{topic_id}'.format(
                    project_id=PROJECT_ID, topic_id=PUB_SUB_TOPIC)
            }
        }]
    }

    # Create the DLP job and let the DLP API process it.
    try:
        dlp.create_dlp_job(parent=parent, inspect_job=inspect_job)
        log('Job created by create_DLP_job', severity=LOG_SEVERITY_INFO)
    except Exception as e:
        log(e, severity=LOG_SEVERITY_ERROR)
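# A minimal sketch of the module-level setup that create_DLP_job above assumes.
# The project, bucket, topic, and info-type values below are placeholders, and
# the log() helper is a stand-in (the original sample's implementation is not
# shown here); adjust everything to your environment.
import os

import google.cloud.dlp_v2
from google.cloud import logging as cloud_logging

PROJECT_ID = os.environ.get('PROJECT_ID', 'my-project')  # placeholder
STAGING_BUCKET = os.environ.get('STAGING_BUCKET', 'my-quarantine-bucket')
PUB_SUB_TOPIC = os.environ.get('PUB_SUB_TOPIC', 'dlp-job-notifications')
INFO_TYPES = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
MIN_LIKELIHOOD = 'POSSIBLE'
MAX_FINDINGS = 0  # 0 means no limit

LOG_SEVERITY_INFO = 'INFO'
LOG_SEVERITY_ERROR = 'ERROR'

dlp = google.cloud.dlp_v2.DlpServiceClient()
_logger = cloud_logging.Client().logger('dlp-quarantine-function')


def log(message, severity=LOG_SEVERITY_INFO):
    # Write a Cloud Logging text entry; stands in for the log() helper used above.
    _logger.log_text(str(message), severity=severity)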
def test_job_name():
    import google.cloud.dlp_v2

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        "privacy_metric": {
            "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}}
        },
        "source_table": {
            "project_id": TEST_TABLE_PROJECT_ID,
            "dataset_id": TEST_DATASET_ID,
            "table_id": TEST_TABLE_ID,
        },
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only the job name, not the full project path.
    job_name = full_path[full_path.rfind("/") + 1:]
    return job_name
def test_job_name():
    import google.cloud.dlp

    dlp = google.cloud.dlp.DlpServiceClient()
    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {'field': {'name': TEST_COLUMN_NAME}}
        },
        'source_table': {
            'project_id': TEST_TABLE_PROJECT_ID,
            'dataset_id': TEST_DATASET_ID,
            'table_id': TEST_TABLE_ID
        }
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind('/') + 1:]
    return job_name
def create_test_job():
    import google.cloud.dlp

    dlp = google.cloud.dlp.DlpServiceClient()
    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        'privacy_metric': {
            'categorical_stats_config': {'field': {'name': TEST_COLUMN_NAME}}
        },
        'source_table': {
            'project_id': TEST_TABLE_PROJECT_ID,
            'dataset_id': TEST_DATASET_ID,
            'table_id': TEST_TABLE_ID
        }
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only job name, not full project path
    job_name = full_path[full_path.rfind('/') + 1:]
    return job_name
def test_job_name():
    import google.api_core.exceptions
    import google.cloud.dlp_v2

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(GCLOUD_PROJECT)

    # Construct job request
    risk_job = {
        "privacy_metric": {
            "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}}
        },
        "source_table": {
            "project_id": TEST_TABLE_PROJECT_ID,
            "dataset_id": TEST_DATASET_ID,
            "table_id": TEST_TABLE_ID,
        },
    }

    response = dlp.create_dlp_job(parent, risk_job=risk_job)
    full_path = response.name
    # API expects only the job name, not the full project path.
    job_name = full_path[full_path.rfind("/") + 1:]
    yield job_name

    # Clean up the job if the test did not delete it.
    try:
        dlp.delete_dlp_job(full_path)
    except google.api_core.exceptions.NotFound:
        print("Issue during teardown, missing job")
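# A hedged sketch of how the fixture above might be consumed, assuming it is
# registered with pytest (e.g. decorated with @pytest.fixture elsewhere in the
# test module); the test body below is illustrative only.
def test_categorical_risk_job_uses_short_name(test_job_name):
    # The fixture yields only the trailing job id, not the full
    # "projects/.../dlpJobs/..." resource path.
    assert "/" not in test_job_name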
def l_diversity_analysis(project, table_project_id, dataset_id, table_id, topic_id, subscription_id, sensitive_attribute, quasi_ids, timeout=300): """Uses the Data Loss Prevention API to compute the l-diversity of a column set in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. sensitive_attribute: The column to measure l-diversity relative to. quasi_ids: A set of columns that form a composite key. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # This sample also uses threading.Event() to wait for the job to finish. import threading # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { 'project_id': table_project_id, 'dataset_id': dataset_id, 'table_id': table_id } # Convert quasi id list to Protobuf type def map_fields(field): return {'name': field} quasi_ids = map(map_fields, quasi_ids) # Tell the API where to send a notification when the job is complete. actions = [{ 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} }] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { 'privacy_metric': { 'l_diversity_config': { 'quasi_ids': quasi_ids, 'sensitive_attribute': { 'name': sensitive_attribute } } }, 'source_table': source_table, 'actions': actions } # Call API to start risk analysis job operation = dlp.create_dlp_job(parent, risk_job=risk_job) # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) subscription = subscriber.subscribe(subscription_path) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. job_done = threading.Event() # Create helper function for unpacking values def get_values(obj): return int(obj.integer_value) def callback(message): try: if (message.attributes['DlpJobName'] == operation.name): # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. 
job = dlp.get_dlp_job(operation.name) histogram_buckets = ( job.risk_details .l_diversity_result .sensitive_value_frequency_histogram_buckets) # Print bucket stats for i, bucket in enumerate(histogram_buckets): print('Bucket {}:'.format(i)) print(' Bucket size range: [{}, {}]'.format( bucket.sensitive_value_frequency_lower_bound, bucket.sensitive_value_frequency_upper_bound)) for value_bucket in bucket.bucket_values: print(' Quasi-ID values: {}'.format( map(get_values, value_bucket.quasi_ids_values))) print(' Class size: {}'.format( value_bucket.equivalence_class_size)) for value in value_bucket.top_sensitive_values: print((' Sensitive value {} occurs {} time(s)' .format(value.value, value.count))) # Signal to the main thread that we can exit. job_done.set() else: # This is not the message we're looking for. message.drop() except Exception as e: # Because this is executing in a thread, an exception won't be # noted unless we print it manually. print(e) raise # Register the callback and wait on the event. subscription.open(callback) finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.')
def inspect_bigquery(project, bigquery_project, dataset_id, table_id, topic_id, subscription_id, info_types, custom_dictionaries=None, custom_regexes=None, min_likelihood=None, max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze BigQuery data. Args: project: The Google Cloud project id to use as a parent resource. bigquery_project: The Google Cloud project id of the target table. dataset_id: The id of the target BigQuery dataset. table_id: The id of the target BigQuery table. topic_id: The id of the Cloud Pub/Sub topic to which the API will broadcast job completion. The topic must already exist. subscription_id: The id of the Cloud Pub/Sub subscription to listen on while waiting for job completion. The subscription must already exist and be subscribed to the topic. info_types: A list of strings representing info types to look for. A full list of info type categories can be fetched from the API. namespace_id: The namespace of the Datastore document, if applicable. min_likelihood: A string representing the minimum likelihood threshold that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. max_findings: The maximum number of findings to report; 0 = no maximum. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # This sample also uses threading.Event() to wait for the job to finish. import threading # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). if not info_types: info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { 'word_list': {'words': custom_dict.split(',')} } } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { 'info_types': info_types, 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } # Construct a storage_config containing the target Bigquery info. storage_config = { 'big_query_options': { 'table_reference': { 'project_id': bigquery_project, 'dataset_id': dataset_id, 'table_id': table_id, } } } # Convert the project id into a full resource id. parent = dlp.project_path(project) # Tell the API where to send a notification when the job is complete. actions = [{ 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} }] # Construct the inspect_job, which defines the entire inspect content task. 
inspect_job = { 'inspect_config': inspect_config, 'storage_config': storage_config, 'actions': actions, } operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. job_done = threading.Event() def callback(message): try: if (message.attributes['DlpJobName'] == operation.name): # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) if job.inspect_details.result.info_type_stats: for finding in job.inspect_details.result.info_type_stats: print('Info type: {}; Count: {}'.format( finding.info_type.name, finding.count)) else: print('No findings.') # Signal to the main thread that we can exit. job_done.set() else: # This is not the message we're looking for. message.drop() except Exception as e: # Because this is executing in a thread, an exception won't be # noted unless we print it manually. print(e) raise # Register the callback and wait on the event. subscriber.subscribe(subscription_path, callback=callback) finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.')
def inspect_bigquery(project, bigquery_project, dataset_id, table_id, info_types, cscc=False, output_project=None, output_dataset_id=None, output_table_id=None, topic_id=None, subscription_id=None, custom_dictionaries=None, custom_regexes=None, min_likelihood=None, max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze BigQuery data. Args: project: The Google Cloud project id to use as a parent resource. bigquery_project: The Google Cloud project id of the target table. dataset_id: The id of the target BigQuery dataset. table_id: The id of the target BigQuery table. info_types: A list of strings representing info types to look for. A full list of info type categories can be fetched from the API. cscc: Should the job publish findings to Cloud Security Command Center. Default is False. output_project: The Google Cloud project id of the output table. output_dataset_id: The id of the output BigQuery dataset. output_table_id: The id of the output BigQuery table. topic_id: The id of the Cloud Pub/Sub topic to which the API will broadcast job completion. The topic must already exist. subscription_id: The id of the Cloud Pub/Sub subscription to listen on while waiting for job completion. The subscription must already exist and be subscribed to the topic. namespace_id: The namespace of the Datastore document, if applicable. min_likelihood: A string representing the minimum likelihood threshold that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. max_findings: The maximum number of findings to report; 0 = no maximum. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # This sample also uses threading.Event() to wait for the job to finish. import threading # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). if not info_types: info_types = [{'name': 'ALL_BASIC'}] else: info_types = [{'name': info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { 'word_list': {'words': custom_dict.split(',')} } } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { 'info_types': info_types, 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } # Construct a storage_config containing the target Bigquery info. storage_config = { 'big_query_options': { 'table_reference': { 'project_id': bigquery_project, 'dataset_id': dataset_id, 'table_id': table_id, } } } # Convert the project id into a full resource id. parent = dlp.project_path(project) # Tell the API where to send findings to. 
actions = [] if cscc: actions.append({ 'publish_summary_to_cscc': {} }) if output_project and output_dataset_id and output_table_id: actions.append({ 'save_findings': { 'output_config': { 'table': { 'project_id': output_project, 'dataset_id': output_dataset_id, 'table_id': output_table_id } } } }) # Construct the inspect_job, which defines the entire inspect content task. inspect_job = { 'inspect_config': inspect_config, 'storage_config': storage_config, 'actions': actions } dlp.create_dlp_job(parent, inspect_job=inspect_job) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. job_done = threading.Event() finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.')
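# An illustrative call to the inspect_bigquery variant above, publishing a
# summary to Cloud Security Command Center and saving findings to a BigQuery
# output table. Every id below is a placeholder, not a value from the original
# sample.
if __name__ == '__main__':
    inspect_bigquery(
        project='my-project',
        bigquery_project='my-project',
        dataset_id='my_dataset',
        table_id='my_table',
        info_types=['EMAIL_ADDRESS', 'PHONE_NUMBER'],
        cscc=True,
        output_project='my-project',
        output_dataset_id='dlp_results',
        output_table_id='findings',
        min_likelihood='POSSIBLE',
        max_findings=100)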
def k_map_estimate_analysis( project, table_project_id, dataset_id, table_id, topic_id, subscription_id, quasi_ids, info_types, region_code="US", timeout=300, ): """Uses the Data Loss Prevention API to compute the k-map risk estimation of a column set in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. column_name: The name of the column to compute risk metrics for. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. quasi_ids: A set of columns that form a composite key and optionally their reidentification distributions. info_types: Type of information of the quasi_id in order to provide a statistical model of population. region_code: The ISO 3166-1 region code that the data is representative of. Can be omitted if using a region-specific infoType (such as US_ZIP_5) timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # Create helper function for unpacking values def get_values(obj): return int(obj.integer_value) def callback(message): if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) histogram_buckets = ( job.risk_details.k_map_estimation_result.k_map_estimation_histogram ) # Print bucket stats for i, bucket in enumerate(histogram_buckets): print("Bucket {}:".format(i)) print( " Anonymity range: [{}, {}]".format( bucket.min_anonymity, bucket.max_anonymity ) ) print(" Size: {}".format(bucket.bucket_size)) for value_bucket in bucket.bucket_values: print( " Values: {}".format( map(get_values, value_bucket.quasi_ids_values) ) ) print( " Estimated k-map anonymity: {}".format( value_bucket.estimated_anonymity ) ) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { "project_id": table_project_id, "dataset_id": dataset_id, "table_id": table_id, } # Check that numbers of quasi-ids and info types are equal if len(quasi_ids) != len(info_types): raise ValueError( """Number of infoTypes and number of quasi-identifiers must be equal!""" ) # Convert quasi id list to Protobuf type def map_fields(quasi_id, info_type): return {"field": {"name": quasi_id}, "info_type": {"name": info_type}} quasi_ids = map(map_fields, quasi_ids, info_types) # Tell the API where to send a notification when the job is complete. 
actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { "privacy_metric": { "k_map_estimation_config": { "quasi_ids": quasi_ids, "region_code": region_code, } }, "source_table": source_table, "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job operation = dlp.create_dlp_job(parent, risk_job=risk_job) try: subscription.result(timeout=timeout) except TimeoutError: print( "No event received before the timeout. Please verify that the " "subscription provided is subscribed to the topic provided." ) subscription.close()
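# An illustrative invocation of k_map_estimate_analysis above. quasi_ids and
# info_types must have the same length because they are zipped pairwise into
# the k-map config; all ids below are placeholders.
k_map_estimate_analysis(
    project="my-project",
    table_project_id="my-project",
    dataset_id="my_dataset",
    table_id="my_table",
    topic_id="dlp-risk-topic",
    subscription_id="dlp-risk-subscription",
    quasi_ids=["age", "gender", "zip_code"],
    info_types=["AGE", "GENDER", "US_ZIP_5"],
    region_code="US",
)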
def l_diversity_analysis( project, table_project_id, dataset_id, table_id, topic_id, subscription_id, sensitive_attribute, quasi_ids, timeout=300, ): """Uses the Data Loss Prevention API to compute the l-diversity of a column set in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. sensitive_attribute: The column to measure l-diversity relative to. quasi_ids: A set of columns that form a composite key. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # Create helper function for unpacking values def get_values(obj): return int(obj.integer_value) def callback(message): if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) histogram_buckets = ( job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets ) # Print bucket stats for i, bucket in enumerate(histogram_buckets): print("Bucket {}:".format(i)) print( " Bucket size range: [{}, {}]".format( bucket.sensitive_value_frequency_lower_bound, bucket.sensitive_value_frequency_upper_bound, ) ) for value_bucket in bucket.bucket_values: print( " Quasi-ID values: {}".format( map(get_values, value_bucket.quasi_ids_values) ) ) print( " Class size: {}".format(value_bucket.equivalence_class_size) ) for value in value_bucket.top_sensitive_values: print( ( " Sensitive value {} occurs {} time(s)".format( value.value, value.count ) ) ) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { "project_id": table_project_id, "dataset_id": dataset_id, "table_id": table_id, } # Convert quasi id list to Protobuf type def map_fields(field): return {"name": field} quasi_ids = map(map_fields, quasi_ids) # Tell the API where to send a notification when the job is complete. actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { "privacy_metric": { "l_diversity_config": { "quasi_ids": quasi_ids, "sensitive_attribute": {"name": sensitive_attribute}, } }, "source_table": source_table, "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. 
subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job operation = dlp.create_dlp_job(parent, risk_job=risk_job) try: subscription.result(timeout=timeout) except TimeoutError: print( "No event received before the timeout. Please verify that the " "subscription provided is subscribed to the topic provided." ) subscription.close()
def numerical_risk_analysis( project, table_project_id, dataset_id, table_id, column_name, topic_id, subscription_id, timeout=300, ): """Uses the Data Loss Prevention API to compute risk metrics of a column of numerical data in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. column_name: The name of the column to compute risk metrics for. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub def callback(message): if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) results = job.risk_details.numerical_stats_result print( "Value Range: [{}, {}]".format( results.min_value.integer_value, results.max_value.integer_value ) ) prev_value = None for percent, result in enumerate(results.quantile_values): value = result.integer_value if prev_value != value: print("Value at {}% quantile: {}".format(percent, value)) prev_value = value subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { "project_id": table_project_id, "dataset_id": dataset_id, "table_id": table_id, } # Tell the API where to send a notification when the job is complete. actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { "privacy_metric": {"numerical_stats_config": {"field": {"name": column_name}}}, "source_table": source_table, "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job operation = dlp.create_dlp_job(parent, risk_job=risk_job) try: subscription.result(timeout=timeout) except TimeoutError: print( "No event received before the timeout. Please verify that the " "subscription provided is subscribed to the topic provided." ) subscription.close()
def categorical_risk_analysis( project, table_project_id, dataset_id, table_id, column_name, topic_id, subscription_id, timeout=300, ): """Uses the Data Loss Prevention API to compute risk metrics of a column of categorical data in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. column_name: The name of the column to compute risk metrics for. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # Instantiate a client. dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into full resource ids. topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) parent = f"projects/{project}/locations/global" # Location info of the BigQuery table. source_table = { "project_id": table_project_id, "dataset_id": dataset_id, "table_id": table_id, } # Tell the API where to send a notification when the job is complete. actions = [{"pub_sub": {"topic": topic}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { "privacy_metric": { "categorical_stats_config": { "field": { "name": column_name } } }, "source_table": source_table, "actions": actions, } # Call API to start risk analysis job operation = dlp.create_dlp_job(request={ "parent": parent, "risk_job": risk_job }) def callback(message): if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(request={"name": operation.name}) histogram_buckets = ( job.risk_details.categorical_stats_result. value_frequency_histogram_buckets # noqa: E501 ) # Print bucket stats for i, bucket in enumerate(histogram_buckets): print("Bucket {}:".format(i)) print(" Most common value occurs {} time(s)".format( bucket.value_frequency_upper_bound)) print(" Least common value occurs {} time(s)".format( bucket.value_frequency_lower_bound)) print(" {} unique values total.".format(bucket.bucket_size)) for value in bucket.bucket_values: print(" Value {} occurs {} time(s)".format( value.value.integer_value, value.count)) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) try: subscription.result(timeout=timeout) except TimeoutError: print("No event received before the timeout. Please verify that the " "subscription provided is subscribed to the topic provided.") subscription.close()
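# Note that the variant above targets the 2.x google-cloud-dlp surface, where
# each RPC takes a single request dict and the parent string carries an explicit
# location, unlike the older dlp.project_path(...) style used in most of the
# other snippets here. A minimal sketch of that calling convention, with a
# placeholder project id:
import google.cloud.dlp_v2

dlp = google.cloud.dlp_v2.DlpServiceClient()
parent = "projects/my-project/locations/global"

# 2.x style: pass one request dict per call.
for job in dlp.list_dlp_jobs(request={"parent": parent}):
    print(job.name, job.state)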
def dlp(request): from google.cloud import bigquery import os os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="C:\gcp_credentials\elaborate-howl-285701-105c2e8355a8.json" client_bigquery = bigquery.Client()#bigquery client import uuid import google.cloud.dlp import time uuid=str(uuid.uuid4()) print(uuid) request_json = request.get_json()#json message received from http request if request_json: file_name=request_json["file_name"] print(file_name) #query of creating table start query=""" create table `elaborate-howl-285701.context.{uuid}_dlp` as SELECT * FROM `elaborate-howl-285701.context.form_key_pair` where file_name=\"{file_name}\"; """.format(uuid=uuid,file_name=file_name) #query of creating table end job_config = bigquery.QueryJobConfig() query_job = client_bigquery.query(query, location="US", job_config=job_config) query_job.result() #dlp work start project='elaborate-howl-285701' bigquery_project='elaborate-howl-285701' dataset_id='context' table_id=uuid+'_dlp' min_likelihood=None, max_findings=None, parent = f"projects/{project}/locations/global" inspect_job_data = { 'storage_config': { 'big_query_options': { 'table_reference': { 'project_id': bigquery_project, 'dataset_id': dataset_id, 'table_id': table_id }, 'identifying_fields':[ { 'name':'file_name', } ], 'excluded_fields':[ { 'name':'field_name', 'name':'time_stamp', 'name':'validated_field_name', 'name':'validated_field_value', 'name':'updated_date', 'name':'confidence', 'name':'updated_by', 'name':'key_x1', 'name':'key_x2', 'name':'key_y1', 'name':'key_y2', 'name':'value_x1', 'name':'value_x2', 'name':'value_y1', 'name':'value_y2', 'name':'pageNumber', 'name':'id', 'name':'type' } ], 'rows_limit':10000, 'sample_method':'TOP', }, }, 'inspect_config': { 'info_types': [{'name': 'FIRST_NAME'}, {'name': 'LAST_NAME'}, {'name': 'EMAIL_ADDRESS'},{'name': 'AGE'}, {'name': 'CREDIT_CARD_NUMBER'}, {'name': 'DATE'},{'name': 'DATE_OF_BIRTH'}, {'name': 'DOMAIN_NAME'}, {'name': 'EMAIL_ADDRESS'}, {'name': 'US_EMPLOYER_IDENTIFICATION_NUMBER'}, {'name': 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER'},{'name': 'US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER'}, {'name': 'US_SOCIAL_SECURITY_NUMBER'}, {'name': 'US_VEHICLE_IDENTIFICATION_NUMBER'}, {'name': 'US_TOLLFREE_PHONE_NUMBER'}, {'name': 'US_STATE'}, {'name': 'US_PASSPORT'},{'name': 'US_HEALTHCARE_NPI'}, {'name': 'GENDER'}, {'name': 'LOCATION'}, {'name': 'PASSPORT'}, {'name': 'PASSWORD'}, {'name': 'PHONE_NUMBER'}, {'name': 'STREET_ADDRESS'},{'name': 'URL'}, {'name': 'US_BANK_ROUTING_MICR'}, {'name': 'US_DEA_NUMBER'},{'name': 'US_DRIVERS_LICENSE_NUMBER'}], "include_quote": True, "min_likelihood": 2, }, 'actions': [ { 'save_findings': { 'output_config':{ 'table':{ 'project_id': bigquery_project, 'dataset_id': dataset_id, 'table_id': '{}_job'.format(table_id) } } }, }, ] } dlp = google.cloud.dlp_v2.DlpServiceClient() operation = dlp.create_dlp_job(parent=parent, inspect_job=inspect_job_data) time.sleep(200) #dlp work end #query for dropping created table query2=""" drop table `elaborate-howl-285701.context.{table_id}`; """.format(table_id=table_id) #query of creating table end job_config = bigquery.QueryJobConfig() query_job2 = client_bigquery.query(query2, location="US", job_config=job_config) query_job2.result() #checking rows in form_key_pair table destination_table = client_bigquery.get_table('elaborate-howl-285701.context.form_key_pair_dlp') # Make an API request. 
print("before insertion {} rows.".format(destination_table.num_rows)) #copy data loss prevention on desired form_key_pair_dlp query3=""" INSERT INTO `elaborate-howl-285701.context.form_key_pair_dlp` SELECT * FROM `elaborate-howl-285701.context.{tableid2}_job` """.format(tableid2=table_id) print(query3) #query of creating table end job_config = bigquery.QueryJobConfig() query_job3 = client_bigquery.query(query3, location="US", job_config=job_config) query_job3.result() #time.sleep(30) #checking rows in form_key_pair table destination_table = client_bigquery.get_table('elaborate-howl-285701.context.form_key_pair_dlp') # Make an API request. print("after insertion {} rows.".format(destination_table.num_rows)) job = dlp.get_dlp_job(request={"name": operation.name}) result_count="" if job.inspect_details.result.info_type_stats: for finding in job.inspect_details.result.info_type_stats: result_="Info type: {}; Count: {}".format(finding.info_type.name, finding.count) result_count=result_+result_count+'\n' print(result_count) #query for dropping dlp table query4=""" drop table `elaborate-howl-285701.context.{table_id2}_job`; """.format(table_id2=table_id) #query of creating table end job_config = bigquery.QueryJobConfig() query_job4 = client_bigquery.query(query4, location="US", job_config=job_config) query_job4.result() ## work for neo4j starts query5 = """ select distinct a.field_value, a.field_name, b.info_type.name as info_, b.likelihood from `elaborate-howl-285701.context.form_key_pair` a, `elaborate-howl-285701.context.form_key_pair_dlp` b where a.file_name=\"{file_name}\" and lower(a.field_value)=lower(b.quote); """.format(file_name=file_name) query_job5 = client_bigquery.query( query5, # Location must match that of the dataset(s) referenced in the query. 
location="US", ) # API request - starts the query df = query_job5.to_dataframe() f_value=[] for a in df.field_value: f_value.append(a) f_name=[] for b in df.field_name: f_name.append(b) info_name=[] for c in df.info_: info_name.append(c) from neo4j import GraphDatabase import logging from neo4j.exceptions import ServiceUnavailable class App: def __init__(self, uri, user, password): self.driver = GraphDatabase.driver(uri, auth=(user, password)) def close(self): # Don't forget to close the driver connection when you are finished with it self.driver.close() def create_friendship(self,file_name,field_value,field_name,info_): with self.driver.session() as session: # Write transactions allow the driver to handle retries and transient errors result = session.write_transaction( self._create_and_return_friendship, file_name, field_value,field_name,info_) print(result) #for row in result: # print("Created relation between: {n}, {m} ".format(n=row['n'], m=row['m'])) # print("Created relation between: {n}, {e} ".format(n=row['n'], e=row['e'])) # print("Created relation between: {e}, {m} ".format(e=row['e'], m=row['m'])) # print("Created relation between: {m}, {w} ".format(m=row['m'], w=row['w'])) @staticmethod def _create_and_return_friendship(tx, file_name, field_value,field_name,info_): # To learn more about the Cypher syntax, see https://neo4j.com/docs/cypher-manual/current/ # The Reference Card is also a good resource for keywords https://neo4j.com/docs/cypher-refcard/current/ query = """ merge (n:File {Name: $file_name}) merge (m:FIELD {Name: $field_name}) merge (e:VALUE {value: $field_value}) merge (w:DLP_Classification {NAME: $info_}) merge (n)-[p:CONTAINS_FIELD]->(m) merge (n)-[q:CONTAINS_VALUE]->(e) merge (e)-[r:TYPE_IS]->(m) merge (m)-[s:DATA_Classification]->(w) RETURN n, m, e, w, p, q, r, s """ result = tx.run(query, file_name=file_name, field_value=field_value,field_name=field_name,info_=info_) try: return [{"n": row["n"]["name"], "e": row["e"]["address"]} for row in result] # Capture any errors along with the query and data for traceability except ServiceUnavailable as exception: logging.error("{query} raised an error: \n {exception}".format( query=query, exception=exception)) raise import itertools for (a,b,c) in zip(f_value,f_name,info_name): print(a+','+b+','+c) bolt_url = "neo4j+s://cfb079ca.databases.neo4j.io" user = "******" password = "******" app = App(bolt_url, user, password) app.create_friendship(file_name, a,b,c) app.close() return "df"
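# The dlp() entry point above is an HTTP-triggered Cloud Function that reads
# {"file_name": ...} from the request body. A hedged sketch of triggering it
# with the requests library; the endpoint URL and file name are placeholders.
import requests

payload = {"file_name": "invoice_2021_001.pdf"}
resp = requests.post(
    "https://REGION-PROJECT.cloudfunctions.net/dlp",  # placeholder endpoint
    json=payload,
)
print(resp.status_code, resp.text)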
def create_DLP_job(data, done):
    """This function is triggered by new files uploaded to the designated
    Cloud Storage quarantine/staging bucket. It creates a DLP job for the
    uploaded file.
    Args:
        data: The Cloud Storage event.
    Returns:
        None. Debug information is printed to the log.
    """
    # Get the targeted file in the quarantine bucket.
    file_name = data['name']
    print('Function triggered for file [{}]'.format(file_name))

    # Prepare info_types by converting the list of strings (INFO_TYPES)
    # into a list of dictionaries.
    info_types = [{'name': info_type} for info_type in INFO_TYPES]

    # Convert the project id into a full resource id.
    parent = dlp.project_path(PROJECT_ID)

    # Construct the configuration dictionary.
    inspect_job = {
        'inspect_config': {
            'info_types': info_types,
            'min_likelihood': MIN_LIKELIHOOD,
            'limits': {
                'max_findings_per_request': MAX_FINDINGS
            },
        },
        'storage_config': {
            'cloud_storage_options': {
                'file_set': {
                    'url': 'gs://{bucket_name}/{file_name}'.format(
                        bucket_name=STAGING_BUCKET, file_name=file_name)
                }
            }
        },
        'actions': [{
            'pub_sub': {
                'topic': 'projects/{project_id}/topics/{topic_id}'.format(
                    project_id=PROJECT_ID, topic_id=PUB_SUB_TOPIC)
            }
        }, {
            'save_findings': {
                'output_config': {
                    'table': {
                        'project_id': PROJECT_ID,
                        'dataset_id': DATASET_ID,
                        'table_id': TABLE_ID
                    }
                }
            }
        }, {
            'publish_summary_to_cscc': {}
        }]
    }

    # Create the DLP job and let the DLP API process it.
    try:
        dlp.create_dlp_job(parent, inspect_job)
        print('Job created by create_DLP_job')
    except Exception as e:
        print(e)
def numerical_risk_analysis(project, table_project_id, dataset_id, table_id, column_name, topic_id, subscription_id, timeout=300): """Uses the Data Loss Prevention API to compute risk metrics of a column of numerical data in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. column_name: The name of the column to compute risk metrics for. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # This sample also uses threading.Event() to wait for the job to finish. import threading # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { 'project_id': table_project_id, 'dataset_id': dataset_id, 'table_id': table_id } # Tell the API where to send a notification when the job is complete. actions = [{ 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} }] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { 'privacy_metric': { 'numerical_stats_config': { 'field': { 'name': column_name } } }, 'source_table': source_table, 'actions': actions } # Call API to start risk analysis job operation = dlp.create_dlp_job(parent, risk_job=risk_job) # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) subscription = subscriber.subscribe(subscription_path) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. job_done = threading.Event() def callback(message): try: if (message.attributes['DlpJobName'] == operation.name): # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) results = job.risk_details.numerical_stats_result print('Value Range: [{}, {}]'.format( results.min_value.integer_value, results.max_value.integer_value)) prev_value = None for percent, result in enumerate(results.quantile_values): value = result.integer_value if prev_value != value: print('Value at {}% quantile: {}'.format( percent, value)) prev_value = value # Signal to the main thread that we can exit. job_done.set() else: # This is not the message we're looking for. message.drop() except Exception as e: # Because this is executing in a thread, an exception won't be # noted unless we print it manually. print(e) raise # Register the callback and wait on the event. subscription.open(callback) finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. 
Please verify that the ' 'subscription provided is subscribed to the topic provided.')
def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, info_types, custom_dictionaries=None, custom_regexes=None, min_likelihood=None, max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze a file on GCS. Args: project: The Google Cloud project id to use as a parent resource. bucket: The name of the GCS bucket containing the file, as a string. filename: The name of the file in the bucket, including the path, as a string; e.g. 'images/myfile.png'. topic_id: The id of the Cloud Pub/Sub topic to which the API will broadcast job completion. The topic must already exist. subscription_id: The id of the Cloud Pub/Sub subscription to listen on while waiting for job completion. The subscription must already exist and be subscribed to the topic. info_types: A list of strings representing info types to look for. A full list of info type categories can be fetched from the API. min_likelihood: A string representing the minimum likelihood threshold that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. max_findings: The maximum number of findings to report; 0 = no maximum. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # This sample also uses threading.Event() to wait for the job to finish. import threading # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). if not info_types: info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] dictionaries = [{ 'info_type': { 'name': 'CUSTOM_DICTIONARY_{}'.format(i) }, 'dictionary': { 'word_list': { 'words': custom_dict.split(',') } } } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': { 'name': 'CUSTOM_REGEX_{}'.format(i) }, 'regex': { 'pattern': custom_regex } } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { 'info_types': info_types, 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': { 'max_findings_per_request': max_findings }, } # Construct a storage_config containing the file's URL. url = 'gs://{}/{}'.format(bucket, filename) storage_config = {'cloud_storage_options': {'file_set': {'url': url}}} # Convert the project id into a full resource id. parent = dlp.project_path(project) # Tell the API where to send a notification when the job is complete. actions = [{'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}}] # Construct the inspect_job, which defines the entire inspect content task. inspect_job = { 'inspect_config': inspect_config, 'storage_config': storage_config, 'actions': actions, } operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) # Create a Pub/Sub client and find the subscription. 
The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path(project, subscription_id) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. job_done = threading.Event() def callback(message): try: if (message.attributes['DlpJobName'] == operation.name): # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) if job.inspect_details.result.info_type_stats: for finding in job.inspect_details.result.info_type_stats: print('Info type: {}; Count: {}'.format( finding.info_type.name, finding.count)) else: print('No findings.') # Signal to the main thread that we can exit. job_done.set() else: # This is not the message we're looking for. message.drop() except Exception as e: # Because this is executing in a thread, an exception won't be # noted unless we print it manually. print(e) raise subscriber.subscribe(subscription_path, callback=callback) finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.')
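# An illustrative call to inspect_gcs_file above; the bucket, object path,
# topic, and subscription names are placeholders.
inspect_gcs_file(
    project='my-project',
    bucket='my-quarantine-bucket',
    filename='uploads/customer_list.csv',
    topic_id='dlp-inspect-topic',
    subscription_id='dlp-inspect-subscription',
    info_types=['EMAIL_ADDRESS', 'CREDIT_CARD_NUMBER'],
    min_likelihood='POSSIBLE',
    max_findings=0)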
def categorical_risk_analysis(project, table_project_id, dataset_id, table_id, column_name, topic_id, subscription_id, timeout=300): """Uses the Data Loss Prevention API to compute risk metrics of a column of categorical data in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. column_name: The name of the column to compute risk metrics for. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub def callback(message): if (message.attributes['DlpJobName'] == operation.name): # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) histogram_buckets = (job.risk_details .categorical_stats_result .value_frequency_histogram_buckets) # Print bucket stats for i, bucket in enumerate(histogram_buckets): print('Bucket {}:'.format(i)) print(' Most common value occurs {} time(s)'.format( bucket.value_frequency_upper_bound)) print(' Least common value occurs {} time(s)'.format( bucket.value_frequency_lower_bound)) print(' {} unique values total.'.format( bucket.bucket_size)) for value in bucket.bucket_values: print(' Value {} occurs {} time(s)'.format( value.value.integer_value, value.count)) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { 'project_id': table_project_id, 'dataset_id': dataset_id, 'table_id': table_id } # Tell the API where to send a notification when the job is complete. actions = [{ 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} }] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { 'privacy_metric': { 'categorical_stats_config': { 'field': { 'name': column_name } } }, 'source_table': source_table, 'actions': actions } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job operation = dlp.create_dlp_job(parent, risk_job=risk_job) try: subscription.result(timeout=timeout) except TimeoutError: print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.') subscription.close()
def k_anonymity_analysis(project, table_project_id, dataset_id, table_id, topic_id, subscription_id, quasi_ids, timeout=300): """Uses the Data Loss Prevention API to compute the k-anonymity of a column set in a Google BigQuery table. Args: project: The Google Cloud project id to use as a parent resource. table_project_id: The Google Cloud project id where the BigQuery table is stored. dataset_id: The id of the dataset to inspect. table_id: The id of the table to inspect. topic_id: The name of the Pub/Sub topic to notify once the job completes. subscription_id: The name of the Pub/Sub subscription to use when listening for job completion notifications. quasi_ids: A set of columns that form a composite key. timeout: The number of seconds to wait for a response from the API. Returns: None; the response from the API is printed to the terminal. """ # Import the client library. import google.cloud.dlp # This sample additionally uses Cloud Pub/Sub to receive results from # potentially long-running operations. import google.cloud.pubsub # Create helper function for unpacking values def get_values(obj): return int(obj.integer_value) def callback(message): if (message.attributes['DlpJobName'] == operation.name): # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) histogram_buckets = (job.risk_details .k_anonymity_result .equivalence_class_histogram_buckets) # Print bucket stats for i, bucket in enumerate(histogram_buckets): print('Bucket {}:'.format(i)) if bucket.equivalence_class_size_lower_bound: print(' Bucket size range: [{}, {}]'.format( bucket.equivalence_class_size_lower_bound, bucket.equivalence_class_size_upper_bound)) for value_bucket in bucket.bucket_values: print(' Quasi-ID values: {}'.format( map(get_values, value_bucket.quasi_ids_values) )) print(' Class size: {}'.format( value_bucket.equivalence_class_size)) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { 'project_id': table_project_id, 'dataset_id': dataset_id, 'table_id': table_id } # Convert quasi id list to Protobuf type def map_fields(field): return {'name': field} quasi_ids = map(map_fields, quasi_ids) # Tell the API where to send a notification when the job is complete. actions = [{ 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} }] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { 'privacy_metric': { 'k_anonymity_config': { 'quasi_ids': quasi_ids } }, 'source_table': source_table, 'actions': actions } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job operation = dlp.create_dlp_job(parent, risk_job=risk_job) try: subscription.result(timeout=timeout) except TimeoutError: print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.') subscription.close()
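# An illustrative call to k_anonymity_analysis above; the quasi-identifier
# column names are placeholders.
k_anonymity_analysis(
    project='my-project',
    table_project_id='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    topic_id='dlp-risk-topic',
    subscription_id='dlp-risk-subscription',
    quasi_ids=['age', 'gender', 'zip_code'])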
def inspect_bigquery(
    project,
    bigquery_project,
    dataset_id,
    table_id,
    topic_id,
    subscription_id,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    timeout=300,
):
    """Uses the Data Loss Prevention API to analyze BigQuery data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        bigquery_project: The Google Cloud project id of the target table.
        dataset_id: The id of the target BigQuery dataset.
        table_id: The id of the target BigQuery table.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: A list of comma-separated word lists to use as
            custom dictionary detectors, if applicable.
        custom_regexes: A list of regular expression patterns to use as custom
            regex detectors, if applicable.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no
            maximum.
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct a storage_config containing the target BigQuery info.
    storage_config = {
        "big_query_options": {
            "table_reference": {
                "project_id": bigquery_project,
                "dataset_id": dataset_id,
                "table_id": table_id,
            }
        }
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Tell the API where to send a notification when the job is complete.
    actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        "inspect_config": inspect_config,
        "storage_config": storage_config,
        "actions": actions,
    }

    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if message.attributes["DlpJobName"] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                if job.inspect_details.result.info_type_stats:
                    for finding in job.inspect_details.result.info_type_stats:
                        print(
                            "Info type: {}; Count: {}".format(
                                finding.info_type.name, finding.count
                            )
                        )
                else:
                    print("No findings.")

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )
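For reference, a hedged usage sketch of inspect_bigquery. The project, table, topic, and subscription ids are placeholders, and the custom regex is only an illustration of the custom_regexes parameter, not part of the original sample.

# Hypothetical usage of inspect_bigquery(); resource names are placeholders.
inspect_bigquery(
    project="my-project",
    bigquery_project="my-table-project",
    dataset_id="my_dataset",
    table_id="my_table",
    topic_id="dlp-inspect-topic",
    subscription_id="dlp-inspect-sub",
    info_types=["EMAIL_ADDRESS", "PHONE_NUMBER"],
    custom_regexes=[r"\d{3}-\d{2}-\d{4}"],  # illustrative SSN-shaped pattern
    min_likelihood="POSSIBLE",
    max_findings=100)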
def k_map_estimate_analysis(project, table_project_id, dataset_id, table_id,
                            topic_id, subscription_id, quasi_ids, info_types,
                            region_code='US', timeout=300):
    """Uses the Data Loss Prevention API to compute the k-map risk estimation
       of a column set in a Google BigQuery table.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_project_id: The Google Cloud project id where the BigQuery table
            is stored.
        dataset_id: The id of the dataset to inspect.
        table_id: The id of the table to inspect.
        topic_id: The name of the Pub/Sub topic to notify once the job
            completes.
        subscription_id: The name of the Pub/Sub subscription to use when
            listening for job completion notifications.
        quasi_ids: A set of columns that form a composite key and optionally
            their reidentification distributions.
        info_types: Type of information of the quasi_id in order to provide a
            statistical model of population.
        region_code: The ISO 3166-1 region code that the data is
            representative of. Can be omitted if using a region-specific
            infoType (such as US_ZIP_5).
        timeout: The number of seconds to wait for a response from the API.

    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    # This sample additionally uses Cloud Pub/Sub to receive results from
    # potentially long-running operations.
    import google.cloud.pubsub

    # This sample also uses threading.Event() to wait for the job to finish.
    import threading

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Location info of the BigQuery table.
    source_table = {
        'project_id': table_project_id,
        'dataset_id': dataset_id,
        'table_id': table_id
    }

    # Check that numbers of quasi-ids and info types are equal
    if len(quasi_ids) != len(info_types):
        raise ValueError('Number of infoTypes and number of quasi-identifiers '
                         'must be equal!')

    # Convert quasi id list to Protobuf type
    def map_fields(quasi_id, info_type):
        return {'field': {'name': quasi_id}, 'info_type': {'name': info_type}}

    quasi_ids = map(map_fields, quasi_ids, info_types)

    # Tell the API where to send a notification when the job is complete.
    actions = [{
        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
    }]

    # Configure risk analysis job
    # Pass in the quasi-identifier/infoType pairs and the region code.
    risk_job = {
        'privacy_metric': {
            'k_map_estimation_config': {
                'quasi_ids': quasi_ids,
                'region_code': region_code
            }
        },
        'source_table': source_table,
        'actions': actions
    }

    # Call API to start risk analysis job
    operation = dlp.create_dlp_job(parent, risk_job=risk_job)

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    # Create helper function for unpacking values
    def get_values(obj):
        return int(obj.integer_value)

    def callback(message):
        try:
            if message.attributes['DlpJobName'] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(operation.name)
                histogram_buckets = (job.risk_details
                                     .k_map_estimation_result
                                     .k_map_estimation_histogram)

                # Print bucket stats
                for i, bucket in enumerate(histogram_buckets):
                    print('Bucket {}:'.format(i))
                    print('  Anonymity range: [{}, {}]'.format(
                        bucket.min_anonymity, bucket.max_anonymity))
                    print('  Size: {}'.format(bucket.bucket_size))
                    for value_bucket in bucket.bucket_values:
                        # Materialize the map object so the values are printed
                        # rather than the map's repr.
                        print('    Values: {}'.format(
                            list(map(get_values,
                                     value_bucket.quasi_ids_values))))
                        print('    Estimated k-map anonymity: {}'.format(
                            value_bucket.estimated_anonymity))

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print('No event received before the timeout. Please verify that the '
              'subscription provided is subscribed to the topic provided.')
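A similar hedged sketch for the k-map sample: one infoType is supplied per quasi-identifier, as enforced by the length check in the function, and all resource names are placeholders.

# Hypothetical usage of k_map_estimate_analysis(); one infoType per
# quasi-identifier, and all resource names are placeholders.
k_map_estimate_analysis(
    project='my-project',
    table_project_id='my-table-project',
    dataset_id='my_dataset',
    table_id='my_table',
    topic_id='dlp-risk-topic',
    subscription_id='dlp-risk-sub',
    quasi_ids=['age', 'gender'],
    info_types=['AGE', 'GENDER'],   # built-in DLP infoTypes modeling each column
    region_code='US',
    timeout=300)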