def inspect_string(project, content_string, info_types,
                   custom_dictionaries=None, custom_regexes=None,
                   min_likelihood=None, max_findings=None,
                   include_quote=True):
    """Inspect a string with the Data Loss Prevention API and return the response.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A single info type name, or a list/tuple of info type
            names, to look for.
        custom_dictionaries: Accepted for signature compatibility; not used
            by this function.
        custom_regexes: Accepted for signature compatibility; not used by
            this function.
        min_likelihood: The minimum likelihood threshold that constitutes a
            match. Defaults to 'LIKELIHOOD_UNSPECIFIED' when None.
        max_findings: The maximum number of findings to request. Defaults to
            0 when None.
        include_quote: Whether the matching text should be included in the
            findings.

    Returns:
        The response object returned by `inspect_content`.
    """
    dlp = google.cloud.dlp.DlpServiceClient()
    parent = dlp.project_path(project)
    item = {'value': content_string}

    # A lone value is wrapped so that both a single name and a sequence of
    # names end up as a list of {'name': ...} dictionaries.
    if isinstance(info_types, (list, tuple)):
        names = info_types
    else:
        names = [info_types]
    info_types = [{'name': name} for name in names]

    # Fall back to the previous hard-coded values only when the caller did
    # not supply one, instead of overwriting the arguments unconditionally.
    if min_likelihood is None:
        min_likelihood = 'LIKELIHOOD_UNSPECIFIED'
    if max_findings is None:
        max_findings = 0

    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    return dlp.inspect_content(parent, inspect_config, item)
def inspectdata(data, project_id, template_id):
    """Inspect `data` against a stored inspect template and return the response.

    Args:
        data: The value passed to the API as the `item` to inspect.
        project_id: The Google Cloud project id; used both for the parent
            resource and for the template resource name.
        template_id: The id of the inspect template within the project.

    Returns:
        The response object returned by `inspect_content`.
    """
    client = google.cloud.dlp_v2.DlpServiceClient()
    parent = client.project_path(project_id)

    # Full resource name of the template the service should apply.
    template_name = f"projects/{project_id}/inspectTemplates/{template_id}"

    return client.inspect_content(
        parent,
        inspect_template_name=template_name,
        item=data,
    )
def inspect_with_medical_record_number_custom_regex_detector(
    project,
    content_string,
):
    """Inspect a string with a custom medical-record-number regex detector.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Custom detector "C_MRN": ###-#-##### where every # is a digit 1-9,
    # reported with a likelihood of POSSIBLE.
    mrn_detector = {
        "info_type": {"name": "C_MRN"},
        "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
        "likelihood": "POSSIBLE",
    }

    # The custom detector is the only thing configured for this request.
    inspect_config = {"custom_info_types": [mrn_detector]}
    item = {"value": content_string}

    # Convert the project id into a full resource id and call the API.
    parent = client.project_path(project)
    response = client.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        try:
            if finding.quote:
                print(f"Quote: {finding.quote}")
        except AttributeError:
            pass
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_string(item, info_types=None, min_likelihood=None,
                   max_findings=None, include_quote=True):
    """Inspect a string for protected data and print each finding.

    Args:
        item: The string to inspect.
        info_types: A list of strings representing info types to look for,
            or None to send no explicit list.
        min_likelihood: A string representing the minimum likelihood
            threshold that constitutes a match.
        max_findings: The maximum number of findings to report.
        include_quote: Boolean for whether to display a quote of the
            detected information in the results.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()

    # Each requested info type is sent as a {'name': ...} dictionary; None
    # is passed through untouched.
    wanted_types = None
    if info_types is not None:
        wanted_types = [{'name': name} for name in info_types]

    # Keys whose value is None may optionally be omitted entirely.
    inspect_config = {
        'info_types': wanted_types,
        'min_likelihood': min_likelihood,
        'max_findings': max_findings,
        'include_quote': include_quote,
    }

    # A single text item, in string form.
    text_item = {'type': 'text/plain', 'value': item}
    response = client.inspect_content(inspect_config, [text_item])

    findings = response.results[0].findings
    if not findings:
        print('No findings.')
        return

    for finding in findings:
        try:
            print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect_string(project, content_string, info_types,
                   min_likelihood=None, max_findings=None,
                   include_quote=True):
    """Inspect a string for protected data and print each finding.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of strings representing info types to look for.
        min_likelihood: A string representing the minimum likelihood
            threshold that constitutes a match.
        max_findings: The maximum number of findings to report.
        include_quote: Boolean for whether to display a quote of the
            detected information in the results.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()

    # Each requested info type is sent as a {'name': ...} dictionary.
    wanted_types = [{'name': name} for name in info_types]

    # Keys whose value is None may optionally be omitted entirely.
    limits = {'max_findings_per_request': max_findings}
    inspect_config = {
        'info_types': wanted_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': limits,
    }
    item = {'value': content_string}

    # Convert the project id into a full resource id and call the API.
    parent = client.project_path(project)
    response = client.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        print('No findings.')
        return

    for finding in findings:
        try:
            if finding.quote:
                print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect(text, project='roi-gcp-demos'):
    """Inspect text for a fixed set of info types and return an HTML summary.

    Args:
        text: The string to inspect.
        project: The Google Cloud project id to use as a parent resource.
            Defaults to the project id this function previously hard-coded.

    Returns:
        A dict with a single "result" key whose value is an HTML fragment
        (entries separated by "<br>") describing each finding, or the string
        "No findings." when the response contains none.
    """
    # Local import keeps the block self-contained.
    import html

    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(project)

    info_type_names = (
        "EMAIL_ADDRESS",
        "CREDIT_CARD_NUMBER",
        "GENERIC_ID",
        "IP_ADDRESS",
        "PHONE_NUMBER",
        "US_DRIVERS_LICENSE_NUMBER",
        "US_SOCIAL_SECURITY_NUMBER",
    )
    inspect_config = {
        "info_types": [{"name": name} for name in info_type_names],
        "include_quote": True,
    }
    item = {"value": text}

    # Call the API
    response = dlp.inspect_content(
        parent,
        inspect_config=inspect_config,
        item=item,
    )

    findings = response.result.findings
    if not findings:
        return {"result": "No findings."}

    parts = []
    for finding in findings:
        quote = getattr(finding, "quote", None)
        if quote:
            # The quote is a slice of caller-supplied text and is embedded
            # in HTML, so it must be escaped before interpolation.
            parts.append("Quote: {}<br>".format(html.escape(quote)))
        parts.append("Info type: {}<br>".format(finding.info_type.name))
        parts.append("Likelihood: {}<br>".format(finding.likelihood))
        parts.append("<br>")

    return {"result": "".join(parts)}
def inspect_file(project, filename, info_types, custom_dictionaries=None,
                 custom_regexes=None, min_likelihood=None, max_findings=None,
                 include_quote=True, mime_type=None):
    """Download a file over HTTP and inspect its bytes with the DLP API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The URL to download; the text after the last '/' is used
            as the name of the local copy written to the working directory.
        info_types: A single info type name, or a list/tuple of info type
            names, to look for.
        custom_dictionaries: Accepted for signature compatibility; not used
            by this function.
        custom_regexes: Accepted for signature compatibility; not used by
            this function.
        min_likelihood: The minimum likelihood threshold that constitutes a
            match.
        max_findings: The maximum number of findings to request.
        include_quote: Whether the matching text should be included in the
            findings.
        mime_type: The MIME type of the file. If None it is guessed from
            `filename`; the lookup table below only knows None, so the byte
            item type sent is always 0.

    Returns:
        `response.result` on success; the caught exception object if the
        download or local write fails; or an explanatory string when the
        file name contains 'xlsx'.
    """
    dlp = google.cloud.dlp.DlpServiceClient()

    # A lone value is wrapped so that both a single name and a sequence of
    # names end up as a list of {'name': ...} dictionaries.
    if isinstance(info_types, (list, tuple)):
        names = info_types
    else:
        names = [info_types]
    info_types = [{'name': name} for name in names]

    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'limits': {
            'max_findings_per_request': max_findings
        },
        'include_quote': include_quote,
    }

    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]
    supported_content_types = {
        None: 0,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    local_name = filename.split('/')[-1]

    # Reject unsupported files before spending a download and a disk write
    # on them.
    if 'xlsx' in local_name:
        return 'I am unable to read this file. Please try again.'

    try:
        headers = {'Authorization': 'Bearer ' + os.environ['TOKEN']}
        r = requests.get(filename, headers=headers, stream=True)
        # Without this an HTTP error page would be saved and inspected as if
        # it were the requested file.
        r.raise_for_status()
        with open(local_name, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        # Existing contract: failures are returned to the caller, not raised.
        return e

    # NOTE: the local copy is left on disk after inspection, as before.
    with open(local_name, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    parent = dlp.project_path(project)
    response = dlp.inspect_content(parent, inspect_config, item)
    return response.result
def redact_text(data, project):
    """Inspect `data['transcript']` and collect the quoted findings.

    Despite the name, nothing is redacted here: each non-empty quote found
    by the API is printed and appended to `data['dlp']`.

    Args:
        data: A mapping with a 'transcript' entry holding the text to
            inspect. A 'dlp' list is created on first use if it is missing.
        project: The Google Cloud project id to use as a parent resource.

    Returns:
        The same `data` object, with any quotes appended to `data['dlp']`.
    """
    # NOTE(review): the raw input and transcript are logged below, before
    # inspection; confirm that is acceptable for sensitive content.
    logging.info(data)
    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(project)

    # This will detect PII data for the info types listed
    # https://cloud.google.com/dlp/docs/infotypes-reference
    info_type_names = [
        "PERSON_NAME",
        "PHONE_NUMBER",
        "ORGANIZATION_NAME",
        "FIRST_NAME",
        "LAST_NAME",
        "EMAIL_ADDRESS",
        "DATE_OF_BIRTH",
        "US_SOCIAL_SECURITY_NUMBER",
        "STREET_ADDRESS",
    ]
    info_types = [{"name": name} for name in info_type_names]
    inspect_config = {"info_types": info_types, "include_quote": True}

    logging.info(data['transcript'])
    item = {"value": data['transcript']}
    response = dlp.inspect_content(
        parent,
        inspect_config=inspect_config,
        item=item,
    )
    logging.info(response)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return data

    for finding in findings:
        quote = getattr(finding, "quote", None)
        if quote:
            print("Quote: {}".format(quote))
            # Create the list on first use rather than failing with KeyError.
            data.setdefault('dlp', []).append(quote)
    return data
def dlp_inspect(message, custom_dictionaries=None, custom_regexes=None):
    """Inspect a message posted to a Slack channel using the Google DLP API.

    If the text matches any identifier, a message is posted both in the
    original channel and in the incident-response (IR) channel, once per
    finding.

    Args:
        message: A mapping with "text", "channel" and "user" entries.
        custom_dictionaries: Optional list of comma-separated word strings,
            each turned into a CUSTOM_DICTIONARY_<i> detector. Defaults to
            ['Test_Keyword'] when None.
        custom_regexes: Optional list of regex patterns, each turned into a
            CUSTOM_REGEX_<i> detector. Defaults to no patterns when None.

    Returns:
        None.
    """
    # Edit this with your Google Cloud Project ID.
    project = os.environ["GOOGLE_CLOUD_PROJECT"]

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # The text to inspect
    item = {'value': message["text"]}

    # The info types to search for in the content. Required.
    info_types = [
        {'name': 'US_SOCIAL_SECURITY_NUMBER'},
        {'name': 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER'},
        {'name': 'CANADA_SOCIAL_INSURANCE_NUMBER'},
        {'name': 'CREDIT_CARD_NUMBER'},
    ]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = ['Test_Keyword']
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {'word_list': {'words': custom_dict.split(',')}},
    } for i, custom_dict in enumerate(custom_dictionaries)]

    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex},
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # The minimum likelihood to constitute a match. Optional.
    min_likelihood = 'LIKELIHOOD_UNSPECIFIED'
    # The maximum number of findings to report (0 = server maximum). Optional.
    max_findings = 0
    # Whether to include the matching string in the results. Optional.
    include_quote = True

    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    # Convert the project id into a full resource id and call the API.
    parent = dlp.project_path(project)
    response = dlp.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        return

    # Everything below depends only on the message, not on the individual
    # finding, so it is resolved once instead of once per finding.
    try:
        channel = message["channel"]
        # Translate the encoded channel name into its actual name.
        channel_translated = CLIENT.api_call("channels.info", channel=channel)
        user = CLIENT.api_call("users.info", user=message["user"])
        # If you have an IR channel you want to alert in, fill it in here.
        ir_channel = os.environ["ir_channel"]
        # Message notifying the IR channel that sensitive data was found.
        ir_message = "<@{}> might have posted some sensitive data in #{}. You might want to check it out.".format(
            user["user"]["name"], channel_translated["channel"]["name"])
    except AttributeError:
        # Same best-effort handling as before: nothing is posted.
        return

    # Send results to Slack channels.
    for finding in findings:
        try:
            # Message notifying the channel where the data was found.
            bot_message = "The following text in your message was found to have sensitive data: `{}`. Type: `{}`.".format(
                finding.quote, finding.info_type.name)
            CLIENT.api_call("chat.postMessage", channel=channel,
                            text=bot_message)
            # Post the message in the IR channel.
            CLIENT.api_call("chat.postMessage", channel=ir_channel,
                            text=ir_message)
        except AttributeError:
            pass
def inspect_file(filename, info_types=None, min_likelihood=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Inspect a file for protected data and print each finding.

    Args:
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for,
            or None to send no explicit list.
        min_likelihood: A string representing the minimum likelihood
            threshold that constitutes a match.
        max_findings: The maximum number of findings to report.
        include_quote: Boolean for whether to display a quote of the
            detected information in the results.
        mime_type: The MIME type of the file. If None, the type is guessed
            from the file name with the standard library's mimetypes module,
            falling back to 'application/octet-stream'.

    Returns:
        None; the findings are printed to the terminal.
    """
    import mimetypes

    # Import the client library
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()

    # Each requested info type is sent as a {'name': ...} dictionary; None
    # is passed through untouched.
    wanted_types = None
    if info_types is not None:
        wanted_types = [{'name': name} for name in info_types]

    # Keys whose value is None may optionally be omitted entirely.
    inspect_config = {
        'info_types': wanted_types,
        'min_likelihood': min_likelihood,
        'max_findings': max_findings,
        'include_quote': include_quote,
    }

    # Guess the MIME type from the file name when the caller gave none.
    if mime_type is None:
        guessed = mimetypes.MimeTypes().guess_type(filename)[0]
        mime_type = guessed or 'application/octet-stream'

    # One item holding the file's raw bytes.
    with open(filename, mode='rb') as f:
        payload = f.read()
    items = [{'type': mime_type, 'data': payload}]

    response = client.inspect_content(inspect_config, items)

    findings = response.results[0].findings
    if not findings:
        print('No findings.')
        return

    for finding in findings:
        try:
            print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def omit_name_if_also_email(
    project,
    content_string,
):
    """Match PERSON_NAME and EMAIL_ADDRESS, but not both for the same text.

    Uses the Data Loss Prevention API to omit matches on PERSON_NAME when
    the EMAIL_ADDRESS detector also matches.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        A list with the info type name of every finding in the response.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Rule: drop a PERSON_NAME finding when it partially matches an
    # EMAIL_ADDRESS finding. This reduces the number of findings when the
    # two infoTypes overlap heavily.
    exclusion_rule = {
        "exclude_info_types": {"info_types": [{"name": "EMAIL_ADDRESS"}]},
        "matching_type": "MATCHING_TYPE_PARTIAL_MATCH",
    }
    name_rule_set = {
        "info_types": [{"name": "PERSON_NAME"}],
        "rules": [{"exclusion_rule": exclusion_rule}],
    }

    # See https://cloud.google.com/dlp/docs/concepts-infotypes for more
    # information about supported infoTypes.
    inspect_config = {
        "info_types": [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}],
        "rule_set": [name_rule_set],
    }
    item = {"value": content_string}

    # Convert the project id into a full resource id and call the API.
    parent = client.project_path(project)
    response = client.inspect_content(parent, inspect_config, item)

    return [finding.info_type.name for finding in response.result.findings]
def inspect_string_custom_omit_overlap(project, content_string):
    """Match PERSON_NAME and a custom detector, preferring the custom one.

    PERSON_NAME findings that fully match a finding of the custom
    VIP_DETECTOR regex are excluded from the results.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Custom regex detector for names, flagged as exclusion-only.
    vip_detector = {
        "info_type": {"name": "VIP_DETECTOR"},
        "regex": {"pattern": "Larry Page|Sergey Brin"},
        "exclusion_type":
            google.cloud.dlp_v2.CustomInfoType.ExclusionType.EXCLUSION_TYPE_EXCLUDE,
    }

    # Exclude PERSON_NAME matches that overlap with VIP_DETECTOR matches.
    exclusion_rule = {
        "exclude_info_types": {"info_types": [{"name": "VIP_DETECTOR"}]},
        "matching_type":
            google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH,
    }
    rule_set = [{
        "info_types": [{"name": "PERSON_NAME"}],
        "rules": [{"exclusion_rule": exclusion_rule}],
    }]

    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "info_types": [{"name": "PERSON_NAME"}],
            "custom_info_types": [vip_detector],
            "rule_set": rule_set,
            "include_quote": True,
        },
        "item": {"value": content_string},
    }
    response = client.inspect_content(request=request)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_file(project, filename, info_types, min_likelihood=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Inspect a file for protected data and return the image bounding boxes.

    Args:
        project: The GCP project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            When empty or None, FIRST_NAME, LAST_NAME and EMAIL_ADDRESS are
            used.
        min_likelihood: A string representing the minimum likelihood
            threshold that constitutes a match.
        max_findings: The maximum number of findings to report.
        include_quote: Boolean for whether to include a quote of the
            detected information in the results.
        mime_type: The MIME type of the file. If None, the type is guessed
            from the file name with the standard library's mimetypes module.

    Returns:
        A list with the bounding boxes of every content location of every
        finding (empty when no finding carries an image location), or None
        when the response contains no findings. Each finding is also printed
        to the terminal.
    """
    import mimetypes

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries.
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # `include_quote` is forwarded so the documented parameter takes effect.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_type = mimetypes.MimeTypes().guess_type(filename)[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # The name the mimetypes module actually reports for .svg files.
        'image/svg+xml': 4,
        'text/plain': 5,
    }
    # Unknown MIME types fall back to 1 (the 'image/jpeg' entry above).
    content_type_index = supported_content_types.get(mime_type, 1)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Convert the project id into a full resource id and call the API.
    parent = dlp.project_path(project)
    response = dlp.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        print('No findings.')
        return None

    # Gather boxes from every finding and every content location, so no
    # finding is dropped and a finding without locations cannot raise
    # IndexError.
    boxes = []
    for finding in findings:
        quote = getattr(finding, 'quote', None)
        if quote:
            print('Quote: {}'.format(quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
        for content_location in finding.location.content_locations:
            boxes.extend(content_location.image_location.bounding_boxes)
    return boxes
def inspect_string(project, content_string, info_types,
                   min_likelihood=None, max_findings=None,
                   include_quote=True):
    """Analyze a string for protected data and print what the API found.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of strings representing info types to look for.
        min_likelihood: A string representing the minimum likelihood
            threshold that constitutes a match.
        max_findings: The maximum number of findings to report.
        include_quote: Boolean for whether to display a quote of the
            detected information in the results.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    service = google.cloud.dlp.DlpServiceClient()

    # Keys whose value is None may optionally be omitted entirely.
    inspect_config = {
        # One {'name': ...} dictionary per requested info type.
        'info_types': [{'name': type_name} for type_name in info_types],
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }
    item = {'value': content_string}

    # Convert the project id into a full resource id.
    parent = service.project_path(project)

    result = service.inspect_content(parent, inspect_config, item).result
    if not result.findings:
        print('No findings.')
        return

    for finding in result.findings:
        try:
            if finding.quote:
                print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect_with_medical_record_number_w_custom_hotwords(
    project,
    content_string,
):
    """Inspect a string with a custom MRN regex detector plus hotword rules.

    The custom hotword rule raises the likelihood of a finding when a
    hotword appears shortly before it.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()
    likelihood = google.cloud.dlp_v2.Likelihood

    # Custom detector "C_MRN": ###-#-##### where every # is a digit 1-9,
    # reported with a likelihood of POSSIBLE.
    mrn_detector = {
        "info_type": {"name": "C_MRN"},
        "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
        "likelihood": likelihood.POSSIBLE,
    }

    # Hotwords "mrn" and "medical" boost the likelihood to VERY_LIKELY when
    # they occur within the 10-character window preceding the finding.
    hotword_rule = {
        "hotword_regex": {"pattern": "(?i)(mrn|medical)(?-i)"},
        "likelihood_adjustment": {"fixed_likelihood": likelihood.VERY_LIKELY},
        "proximity": {"window_before": 10},
    }
    rule_set = [{
        "info_types": [{"name": "C_MRN"}],
        "rules": [{"hotword_rule": hotword_rule}],
    }]

    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "custom_info_types": [mrn_detector],
            "rule_set": rule_set,
            "include_quote": True,
        },
        "item": {"value": content_string},
    }
    response = client.inspect_content(request=request)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_string_multiple_rules(project, content_string):
    """Inspect a string for PERSON_NAME with hotword and exclusion rules.

    Two hotword rules adjust the likelihood of a match and two exclusion
    rules drop matches altogether; all four apply to PERSON_NAME.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()
    likelihood = google.cloud.dlp_v2.Likelihood
    partial_match = google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH

    # Hotword rules: "patient" within the 10 preceding characters fixes the
    # likelihood at VERY_LIKELY, "doctor" fixes it at UNLIKELY.
    patient_rule = {
        "hotword_regex": {"pattern": "patient"},
        "proximity": {"window_before": 10},
        "likelihood_adjustment": {"fixed_likelihood": likelihood.VERY_LIKELY},
    }
    doctor_rule = {
        "hotword_regex": {"pattern": "doctor"},
        "proximity": {"window_before": 10},
        "likelihood_adjustment": {"fixed_likelihood": likelihood.UNLIKELY},
    }

    # Exclusion rules: a dictionary word and a regex, both partial matches.
    quasimodo_rule = {
        "dictionary": {"word_list": {"words": ["quasimodo"]}},
        "matching_type": partial_match,
    }
    redacted_rule = {
        "regex": {"pattern": "REDACTED"},
        "matching_type": partial_match,
    }

    # Combine the above rules into one rule set for PERSON_NAME.
    rules = [
        {"hotword_rule": patient_rule},
        {"hotword_rule": doctor_rule},
        {"exclusion_rule": quasimodo_rule},
        {"exclusion_rule": redacted_rule},
    ]
    rule_set = [{"info_types": [{"name": "PERSON_NAME"}], "rules": rules}]

    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "info_types": [{"name": "PERSON_NAME"}],
            "rule_set": rule_set,
            "include_quote": True,
        },
        "item": {"value": content_string},
    }
    response = client.inspect_content(request=request)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_with_person_name_w_custom_hotword(project, content_string,
                                              custom_hotword="patient"):
    """Inspect a string, boosting PERSON_NAME matches near a custom hotword.

    The request also sets a minimum likelihood of VERY_LIKELY.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        custom_hotword: The hotword, sent as the hotword regex pattern, used
            for likelihood boosting.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()
    very_likely = google.cloud.dlp_v2.Likelihood.VERY_LIKELY

    # Boost to VERY_LIKELY when the caller's hotword occurs within the
    # 50-character window preceding the finding.
    hotword_rule = {
        "hotword_regex": {"pattern": custom_hotword},
        "likelihood_adjustment": {"fixed_likelihood": very_likely},
        "proximity": {"window_before": 50},
    }
    rule_set = [{
        "info_types": [{"name": "PERSON_NAME"}],
        "rules": [{"hotword_rule": hotword_rule}],
    }]

    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "rule_set": rule_set,
            "min_likelihood": very_likely,
            "include_quote": True,
        },
        "item": {"value": content_string},
    }
    response = client.inspect_content(request=request)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_string_without_overlap(project, content_string):
    """Match EMAIL_ADDRESS and DOMAIN_NAME, omitting overlapping domains.

    DOMAIN_NAME findings that partially match an EMAIL_ADDRESS finding are
    excluded from the results.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the findings are printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # EMAIL_ADDRESS re-declared as a custom info type flagged as
    # exclusion-only.
    email_exclusion = {
        "info_type": {"name": "EMAIL_ADDRESS"},
        "exclusion_type":
            google.cloud.dlp_v2.CustomInfoType.ExclusionType.EXCLUSION_TYPE_EXCLUDE,
    }

    # Exclude DOMAIN_NAME matches that overlap with EMAIL_ADDRESS matches.
    exclusion_rule = {
        "exclude_info_types": {"info_types": [{"name": "EMAIL_ADDRESS"}]},
        "matching_type":
            google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
    }
    rule_set = [{
        "info_types": [{"name": "DOMAIN_NAME"}],
        "rules": [{"exclusion_rule": exclusion_rule}],
    }]

    # See https://cloud.google.com/dlp/docs/concepts-infotypes for more
    # information about supported infoTypes.
    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "info_types": [{"name": "DOMAIN_NAME"}, {"name": "EMAIL_ADDRESS"}],
            "custom_info_types": [email_exclusion],
            "rule_set": rule_set,
            "include_quote": True,
        },
        "item": {"value": content_string},
    }
    response = client.inspect_content(request=request)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_file(project, filename, info_types, min_likelihood=None,
                 custom_dictionaries=None, custom_regexes=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Uses the Data Loss Prevention API to analyze a file for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            When empty or None, FIRST_NAME, LAST_NAME and EMAIL_ADDRESS are
            used. A full list of info type categories can be fetched from
            the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the response from the API is printed to the terminal.

    Raises:
        OSError: If `filename` cannot be opened for reading.
    """
    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        # Forward the caller's choice; without this key the parameter had no
        # effect on the request.
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # The mimetypes module reports SVG files under this name.
        'image/svg+xml': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return
    for finding in findings:
        # The quote is empty when quotes were not requested; skip the line
        # in that case, as the string/table inspectors in this file do.
        if finding.quote:
            print('Quote: {}'.format(finding.quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect_string_with_exclusion_dict(project, content_string, exclusion_list=["*****@*****.**"]):
    """Find e-mail addresses in a string, skipping those on an exclusion list.

    The Data Loss Prevention API is asked for EMAIL_ADDRESS findings with an
    exclusion rule attached: a match is omitted when its text fully matches
    one of the words in `exclusion_list`.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        exclusion_list: The list of strings to ignore matches on. The default
            list is only read, never modified, by this function.

    Returns:
        None; every finding (quote, info type, likelihood) is printed, or
        "No findings." when the response carries none.
    """
    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # The same infoType list is used both as the search target and as the
    # scope of the exclusion rule.
    email_only = [{"name": "EMAIL_ADDRESS"}]

    skip_listed_words = {
        "exclusion_rule": {
            "dictionary": {"word_list": {"words": exclusion_list}},
            "matching_type":
                google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH,
        }
    }

    # Assemble the whole request and call the API.
    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "info_types": email_only,
            "rule_set": [{
                "info_types": email_only,
                "rules": [skip_listed_words],
            }],
            "include_quote": True,
        },
        "item": {"value": content_string},
    }
    response = dlp.inspect_content(request=request)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_string(
    project,
    content_string,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    include_quote=True,
):
    """Inspect a string for protected data with the Data Loss Prevention API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of strings naming the info types to look for.
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Wrap every requested info type name in the dictionary form the API
    # expects.
    wanted_types = []
    for type_name in info_types:
        wanted_types.append({"name": type_name})

    # Turn the optional word lists and regex patterns into custom info types,
    # dictionaries first, then regexes.
    word_lists = [] if custom_dictionaries is None else custom_dictionaries
    patterns = [] if custom_regexes is None else custom_regexes
    custom_types = []
    for index, words in enumerate(word_lists):
        custom_types.append({
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(index)},
            "dictionary": {"word_list": {"words": words.split(",")}},
        })
    for index, pattern in enumerate(patterns):
        custom_types.append({
            "info_type": {"name": "CUSTOM_REGEX_{}".format(index)},
            "regex": {"pattern": pattern},
        })

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": wanted_types,
        "custom_info_types": custom_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Call the API with the string as the item and the project as parent.
    response = dlp.inspect_content(
        dlp.project_path(project),
        inspect_config,
        {"value": content_string},
    )

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        try:
            if finding.quote:
                print("Quote: {}".format(finding.quote))
        except AttributeError:
            pass
        print("Info type: {}".format(finding.info_type.name))
        print("Likelihood: {}".format(finding.likelihood))
def inspect_file(project, filename, info_types, min_likelihood=None,
                 custom_dictionaries=None, custom_regexes=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Uses the Data Loss Prevention API to analyze a file for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            When empty or None, the single info type 'ALL_BASIC' is
            requested. A full list of info type categories can be fetched
            from the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the response from the API is printed to the terminal.

    Raises:
        OSError: If `filename` cannot be opened for reading.
    """
    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = [{'name': 'ALL_BASIC'}]
    else:
        info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        # Forward the caller's choice; without this key the parameter had no
        # effect on the request.
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # The mimetypes module reports SVG files under this name.
        'image/svg+xml': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return
    for finding in findings:
        # The quote is empty when quotes were not requested; skip the line
        # in that case, as the string/table inspectors in this file do.
        if finding.quote:
            print('Quote: {}'.format(finding.quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect_string_custom_excluding_substring(project, content_string, exclusion_list=["jimmy"]):
    """Run a custom name detector over a string, skipping excluded tokens.

    A regex-based custom info type, CUSTOM_NAME_DETECTOR, is sent to the Data
    Loss Prevention API together with an exclusion rule: a match is omitted
    when it partially matches one of the words in `exclusion_list`.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        exclusion_list: The list of strings to ignore matches on. The default
            list is only read, never modified, by this function.

    Returns:
        None; every finding (quote, info type, likelihood) is printed, or
        "No findings." when the response carries none.
    """
    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    detector_name = "CUSTOM_NAME_DETECTOR"

    # Custom regex detector for names.
    name_detector = {
        "info_type": {"name": detector_name},
        "regex": {"pattern": "[A-Z][a-z]{1,15}, [A-Z][a-z]{1,15}"},
    }

    # Rule that suppresses detector matches containing an excluded token.
    skip_excluded_tokens = {
        "exclusion_rule": {
            "dictionary": {"word_list": {"words": exclusion_list}},
            "matching_type":
                google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
        }
    }

    # Assemble the whole request and call the API.
    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "custom_info_types": [name_detector],
            "rule_set": [{
                "info_types": [{"name": detector_name}],
                "rules": [skip_excluded_tokens],
            }],
            "include_quote": True,
        },
        "item": {"value": content_string},
    }
    response = dlp.inspect_content(request=request)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Inspect tabular data for protected data with the Data Loss Prevention API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A mapping with a "header" entry (list of column names) and a
            "rows" entry (list of rows, each a list of cell values). Every
            cell is sent to the API as a string value.
        info_types: A list of strings naming the info types to look for.
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.

    Example:
        data = {
            "header": ["email", "phone number"],
            "rows": [
                ["<email 1>", "4232342345"],
                ["<email 2>", "4253458383"],
            ],
        }
    """
    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Wrap every requested info type name in the dictionary form the API
    # expects.
    wanted_types = []
    for type_name in info_types:
        wanted_types.append({'name': type_name})

    # Turn the optional word lists and regex patterns into custom info types,
    # dictionaries first, then regexes.
    word_lists = [] if custom_dictionaries is None else custom_dictionaries
    patterns = [] if custom_regexes is None else custom_regexes
    custom_types = []
    for index, words in enumerate(word_lists):
        custom_types.append({
            'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(index)},
            'dictionary': {'word_list': {'words': words.split(',')}},
        })
    for index, pattern in enumerate(patterns):
        custom_types.append({
            'info_type': {'name': 'CUSTOM_REGEX_{}'.format(index)},
            'regex': {'pattern': pattern},
        })

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': wanted_types,
        'custom_info_types': custom_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    item = {
        "table": {
            "headers": [{"name": column} for column in data["header"]],
            "rows": [
                {"values": [{"string_value": cell} for cell in row]}
                for row in data["rows"]
            ],
        }
    }

    # Call the API with the project as parent resource.
    response = dlp.inspect_content(
        dlp.project_path(project), inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return
    for finding in findings:
        try:
            if finding.quote:
                print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect_file(project, filename, info_types, min_likelihood=None,
                 custom_dictionaries=None, custom_regexes=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Inspect a file with the DLP API and classify it by number of findings.

    The file's bytes are sent to the Data Loss Prevention API. A per-type
    summary is printed, and the file is labelled as containing sensitive data
    when the response holds at least 10 findings.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: The info types to look for. The value is placed in the
            request unchanged.
            NOTE(review): presumably already a list of {'name': ...}
            dictionaries - confirm against callers.
        min_likelihood: The minimum likelihood to constitute a match.
            Defaults to 'LIKELY' when None.
        custom_dictionaries: Accepted for signature compatibility with the
            other inspectors; not used by this function.
        custom_regexes: Accepted for signature compatibility with the other
            inspectors; not used by this function.
        max_findings: The maximum number of findings to report (0 = server
            maximum). Defaults to 0 when None.
        include_quote: Boolean for whether the API should include the
            matching text in each finding.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        'sensitive_data_found_' + filename when the response holds 10 or more
        findings, otherwise 'no_sensitve_data_found_' + filename. (The
        spelling of the second prefix is kept as-is because callers may
        compare against it.)

    Raises:
        OSError: If `filename` cannot be opened for reading.
    """
    import collections
    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Number of findings at which a file is reported as sensitive.
    sensitive_threshold = 10

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Apply the defaults only when the caller did not supply a value, so that
    # explicitly passed arguments are honoured.
    if min_likelihood is None:
        min_likelihood = 'LIKELY'
    if max_findings is None:
        max_findings = 0

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # The mimetypes module reports SVG files under this name.
        'image/svg+xml': 4,
        'text/plain': 5
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        print('No findings.')
        return 'no_sensitve_data_found_' + filename

    # Print the summary BEFORE returning; these lines used to sit after the
    # return statements and could never run.
    total = len(findings)
    per_type = collections.Counter(
        finding.info_type.name for finding in findings)
    print('the total number of findings is: ', total)
    print('TOTAL_FINDINGS = ', total)
    print('FIRST_NAMES = ', per_type['FIRST_NAME'])
    print('LAST_NAMES = ', per_type['LAST_NAME'])
    print('LOCATIONS = ', per_type['LOCATION'])
    print('US_STATES = ', per_type['US_STATE'])

    if total >= sensitive_threshold:
        return 'sensitive_data_found_' + filename
    return 'no_sensitve_data_found_' + filename
def inspect_table(
    project,
    data,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    include_quote=True,
):
    """Inspect tabular data for protected data with the Data Loss Prevention API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A mapping with a "header" entry (list of column names) and a
            "rows" entry (list of rows, each a list of cell values). Every
            cell is sent to the API as a string value.
        info_types: A list of strings naming the info types to look for.
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.

    Example:
        data = {
            "header": ["email", "phone number"],
            "rows": [
                ["<email 1>", "4232342345"],
                ["<email 2>", "4253458383"],
            ],
        }
    """
    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Wrap every requested info type name in the dictionary form the API
    # expects.
    wanted_types = []
    for type_name in info_types:
        wanted_types.append({"name": type_name})

    # Turn the optional word lists and regex patterns into custom info types,
    # dictionaries first, then regexes.
    word_lists = [] if custom_dictionaries is None else custom_dictionaries
    patterns = [] if custom_regexes is None else custom_regexes
    custom_types = []
    for index, words in enumerate(word_lists):
        custom_types.append({
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(index)},
            "dictionary": {"word_list": {"words": words.split(",")}},
        })
    for index, pattern in enumerate(patterns):
        custom_types.append({
            "info_type": {"name": "CUSTOM_REGEX_{}".format(index)},
            "regex": {"pattern": pattern},
        })

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": wanted_types,
        "custom_info_types": custom_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    item = {
        "table": {
            "headers": [{"name": column} for column in data["header"]],
            "rows": [
                {"values": [{"string_value": cell} for cell in row]}
                for row in data["rows"]
            ],
        }
    }

    # Call the API with the project as parent resource.
    response = dlp.inspect_content(
        dlp.project_path(project), inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        try:
            if finding.quote:
                print("Quote: {}".format(finding.quote))
        except AttributeError:
            pass
        print("Info type: {}".format(finding.info_type.name))
        print("Likelihood: {}".format(finding.likelihood))
def quickstart():
    """Minimal end-to-end example of the Data Loss Prevention client library.

    Inspects the fixed string 'Robert Frost' for FIRST_NAME and LAST_NAME and
    prints each finding's quote, info type and likelihood name, or
    'No findings.' when the response carries none.
    """
    # [START dlp_quickstart]
    # Import the client library
    import google.cloud.dlp

    # Edit this with your Google Cloud Project ID.
    project = 'your-project'

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # The item to inspect wraps the string under the 'value' key.
    item = {'value': 'Robert Frost'}

    # Configuration: required info types, then the optional settings
    # (minimum likelihood, whether to include the matching string, and the
    # findings limit where 0 = server maximum). Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': [{'name': 'FIRST_NAME'}, {'name': 'LAST_NAME'}],
        'min_likelihood': 'LIKELIHOOD_UNSPECIFIED',
        'include_quote': True,
        'limits': {'max_findings_per_request': 0},
    }

    # Convert the project id into a full resource id and call the API.
    response = dlp.inspect_content(
        dlp.project_path(project), inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return
    for finding in findings:
        try:
            print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        # Convert likelihood value to string representation by looking the
        # number up in the enum attached to the Finding.likelihood field.
        likelihood_field = (
            google.cloud.dlp.types.Finding.DESCRIPTOR
            .fields_by_name['likelihood'])
        enum_value = likelihood_field.enum_type.values_by_number[
            finding.likelihood]
        likelihood = enum_value.name
        print('Likelihood: {}'.format(likelihood))
def inspect_file(
    project,
    filename,
    info_types,
    min_likelihood=None,
    custom_dictionaries=None,
    custom_regexes=None,
    max_findings=None,
    include_quote=True,
    mime_type=None,
):
    """Uses the Data Loss Prevention API to analyze a file for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            When empty or None, FIRST_NAME, LAST_NAME and EMAIL_ADDRESS are
            used. A full list of info type categories can be fetched from
            the API.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the response from the API is printed to the terminal.

    Raises:
        OSError: If `filename` cannot be opened for reading.
    """
    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        # Forward the caller's choice; without this key the parameter had no
        # effect on the request.
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        # The mimetypes module reports SVG files under this name.
        "image/svg+xml": 4,
        "text/plain": 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode="rb") as f:
        item = {"byte_item": {"type": content_type_index, "data": f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        # The quote is empty when quotes were not requested; skip the line
        # in that case, as the string/table inspectors in this file do.
        if finding.quote:
            print("Quote: {}".format(finding.quote))
        print("Info type: {}".format(finding.info_type.name))
        print("Likelihood: {}".format(finding.likelihood))
def main(custom_dictionaries=None, custom_regexes=None, mime_type=None):
    """Inspect the file named on the command line and print the findings.

    The path is read from the `file` attribute of the object returned by
    `parse_arguments()`. The file is inspected for FIRST_NAME, LAST_NAME and
    CREDIT_CARD_NUMBER plus any custom info types built from the arguments.

    NOTE(review): `parse_arguments`, `mimetypes`, `dlp` and `project` are not
    defined in this function; they are expected to exist in an enclosing
    (presumably module-level) scope - confirm.

    Args:
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        mime_type: The MIME type of the file. If not specified, it is guessed
            from the file name.

    Returns:
        None; the response from the API is printed to the terminal.
    """
    args = parse_arguments()

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_type, _encoding = mimetypes.MimeTypes().guess_type(args.file)

    # Select the content type index from the list of supported types.
    content_type_by_mime = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = content_type_by_mime.get(mime_type, 0)

    # The file to inspect
    with open(args.file, mode='rb') as handle:
        payload = handle.read()
    item = {'byte_item': {'type': content_type_index, 'data': payload}}

    # Turn the optional word lists and regex patterns into custom info types,
    # dictionaries first, then regexes.
    word_lists = [] if custom_dictionaries is None else custom_dictionaries
    patterns = [] if custom_regexes is None else custom_regexes
    custom_types = []
    for index, words in enumerate(word_lists):
        custom_types.append({
            'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(index)},
            'dictionary': {'word_list': {'words': words.split(',')}},
        })
    for index, pattern in enumerate(patterns):
        custom_types.append({
            'info_type': {'name': 'CUSTOM_REGEX_{}'.format(index)},
            'regex': {'pattern': pattern},
        })

    # Configuration: required info types, the custom types, then the optional
    # settings (minimum likelihood, whether to include the matching string,
    # and the findings limit where 0 = server maximum).
    inspect_config = {
        'info_types': [
            {'name': 'FIRST_NAME'},
            {'name': 'LAST_NAME'},
            {'name': 'CREDIT_CARD_NUMBER'},
        ],
        'custom_info_types': custom_types,
        'min_likelihood': 'LIKELIHOOD_UNSPECIFIED',
        'include_quote': True,
        'limits': {'max_findings_per_request': 0},
    }

    # Convert the project id into a full resource id and call the API.
    response = dlp.inspect_content(
        dlp.project_path(project), inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return
    for finding in findings:
        try:
            print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))