Beispiel #1
0
def inspect_string(project,
                   content_string,
                   info_types,
                   custom_dictionaries=None,
                   custom_regexes=None,
                   min_likelihood=None,
                   max_findings=None,
                   include_quote=True):
    """Inspect a string with the Data Loss Prevention API and return the response.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: Either a single info type name (str) or an iterable of
            info type names to look for.
        custom_dictionaries: Accepted for signature compatibility; this
            function does not use it.
        custom_regexes: Accepted for signature compatibility; this function
            does not use it.
        min_likelihood: Minimum likelihood threshold for a match. When None,
            'LIKELIHOOD_UNSPECIFIED' is sent.
        max_findings: Value sent as `limits.max_findings_per_request`. When
            None, 0 is sent.
        include_quote: Whether the response should include the matched text.

    Returns:
        The response object returned by `DlpServiceClient.inspect_content`.
    """
    dlp = google.cloud.dlp.DlpServiceClient()

    parent = dlp.project_path(project)
    item = {'value': content_string}

    # A bare string is a single info type name; anything else is treated as
    # an iterable of names. Wrapping a list as one "name" would produce a
    # malformed request.
    if isinstance(info_types, str):
        info_types = [info_types]
    info_types = [{'name': name} for name in info_types]

    # Fall back to the previously hard-coded values only when the caller did
    # not supply one, instead of overwriting the arguments unconditionally.
    if min_likelihood is None:
        min_likelihood = 'LIKELIHOOD_UNSPECIFIED'
    if max_findings is None:
        max_findings = 0

    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    return dlp.inspect_content(parent, inspect_config, item)
Beispiel #2
0
def inspectdata(data, project_id, template_id):
    """Inspect `data` with a stored DLP inspect template.

    Args:
        data: The content item passed to the API as `item`.
        project_id: Project id used for both the parent resource and the
            template resource name.
        template_id: Id of the inspect template inside `project_id`.

    Returns:
        The response object returned by `DlpServiceClient.inspect_content`.
    """
    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path(project_id)

    # Full resource name of the template to apply.
    template_name = "projects/{}/inspectTemplates/{}".format(
        project_id, template_id)

    return client.inspect_content(
        parent_path,
        inspect_template_name=template_name,
        item=data,
    )
Beispiel #3
0
def inspect_with_medical_record_number_custom_regex_detector(
    project,
    content_string,
):
    """Inspect a string using a custom regex detector for medical record numbers.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; each finding (or "No findings.") is printed to the terminal.
    """

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Custom info type "C_MRN": three digits, one digit, five digits,
    # separated by dashes, each digit in the range 1-9. Matches are reported
    # with likelihood POSSIBLE.
    mrn_detector = {
        "info_type": {"name": "C_MRN"},
        "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
        "likelihood": "POSSIBLE",
    }

    # Only the custom detector is configured; no built-in info types.
    config = {"custom_info_types": [mrn_detector]}
    content_item = {"value": content_string}
    parent_path = client.project_path(project)

    response = client.inspect_content(parent_path, config, content_item)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        # A finding without a `quote` attribute is tolerated.
        try:
            if finding.quote:
                print(f"Quote: {finding.quote}")
        except AttributeError:
            pass
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_string(item,
                   info_types=None,
                   min_likelihood=None,
                   max_findings=None,
                   include_quote=True):
    """Analyze a string for protected data with the Data Loss Prevention API.

    Args:
        item: The string to inspect.
        info_types: Optional list of info type names to look for. When None,
            None is passed through in the configuration.
        min_likelihood: Optional minimum likelihood threshold for a match.
            One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
            'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Whether the matched text should be included in the
            results.

    Returns:
        None; the findings are printed to the terminal.
    """

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()

    # The API takes info types as dictionaries, not bare names.
    type_dicts = None
    if info_types is not None:
        type_dicts = []
        for type_name in info_types:
            type_dicts.append({'name': type_name})

    config = {
        'info_types': type_dicts,
        'min_likelihood': min_likelihood,
        'max_findings': max_findings,
        'include_quote': include_quote,
    }

    # One plain-text item holding the string to inspect.
    text_item = {'type': 'text/plain', 'value': item}

    response = client.inspect_content(config, [text_item])

    if not response.results[0].findings:
        print('No findings.')
        return

    for finding in response.results[0].findings:
        # The quote line is skipped when the finding has no `quote`.
        try:
            quote = finding.quote
        except AttributeError:
            pass
        else:
            print('Quote: {}'.format(quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect_string(project, content_string, info_types,
                   min_likelihood=None, max_findings=None, include_quote=True):
    """Analyze a string for protected data with the Data Loss Prevention API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of info type names to look for.
        min_likelihood: Optional minimum likelihood threshold for a match.
            One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
            'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Whether the matched text should be included in the
            results.

    Returns:
        None; the findings are printed to the terminal.
    """

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()

    # The API takes info types as dictionaries, not bare names.
    wanted_types = []
    for type_name in info_types:
        wanted_types.append({'name': type_name})

    config = {
        'info_types': wanted_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }
    content_item = {'value': content_string}
    parent_path = client.project_path(project)

    response = client.inspect_content(parent_path, config, content_item)

    findings = response.result.findings
    if not findings:
        print('No findings.')
        return

    for finding in findings:
        # The quote line is printed only when a non-empty quote is present.
        try:
            if finding.quote:
                print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
Beispiel #6
0
def inspect(text):
    """Inspect `text` for a fixed set of info types and format the findings.

    The request is sent under the hard-coded project 'roi-gcp-demos' with
    quotes enabled.

    Args:
        text: The string to inspect.

    Returns:
        A dict with a single key "result": either "No findings." or the
        findings rendered as text with `<br>` line separators.
    """
    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path('roi-gcp-demos')

    type_names = (
        "EMAIL_ADDRESS",
        "CREDIT_CARD_NUMBER",
        "GENERIC_ID",
        "IP_ADDRESS",
        "PHONE_NUMBER",
        "US_DRIVERS_LICENSE_NUMBER",
        "US_SOCIAL_SECURITY_NUMBER",
    )
    config = {
        "info_types": [{"name": type_name} for type_name in type_names],
        "include_quote": True,
    }
    content_item = {"value": text}

    response = client.inspect_content(
        parent_path,
        inspect_config=config,
        item=content_item,
    )

    findings = response.result.findings
    if not findings:
        return {"result": "No findings."}

    # Collect the fragments and join once at the end.
    fragments = []
    for finding in findings:
        # The quote line is emitted only when a non-empty quote is present.
        try:
            if finding.quote:
                fragments.append("Quote: {}<br>".format(finding.quote))
        except AttributeError:
            pass
        fragments.append("Info type: {}<br>".format(finding.info_type.name))
        fragments.append("Likelihood: {}<br>".format(finding.likelihood))
        fragments.append("<br>")
    return {"result": "".join(fragments)}
Beispiel #7
0
def inspect_file(project,
                 filename,
                 info_types,
                 custom_dictionaries=None,
                 custom_regexes=None,
                 min_likelihood=None,
                 max_findings=None,
                 include_quote=True,
                 mime_type=None):
    """Download a file over HTTP and inspect its bytes with the DLP API.

    The file is fetched from `filename` with a bearer token taken from the
    `TOKEN` environment variable and saved in the current working directory
    under the last path segment of `filename`. The downloaded bytes are then
    sent for inspection.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: URL of the file to download and inspect.
        info_types: Either a single info type name (str) or an iterable of
            info type names to look for.
        custom_dictionaries: Accepted for signature compatibility; this
            function does not use it.
        custom_regexes: Accepted for signature compatibility; this function
            does not use it.
        min_likelihood: Optional minimum likelihood threshold for a match.
        max_findings: Value sent as `limits.max_findings_per_request`.
        include_quote: Whether the response should include the matched text.
        mime_type: MIME type of the file; guessed from `filename` when None.
            Every type currently maps to byte-type 0, so it has no effect on
            the request yet.

    Returns:
        One of:
        * `response.result` from the inspect call on success;
        * the caught exception object (returned, not raised) when the
          download or the local save fails, including HTTP error statuses;
        * a plain message string when the file name contains 'xlsx'.
    """
    local_name = filename.split('/')[-1]

    # Reject spreadsheets before doing any work, so no download happens and
    # no stray file is left on disk for a file that is never inspected.
    if 'xlsx' in local_name:
        return 'I am unable to read this file.  Please try again.'

    dlp = google.cloud.dlp.DlpServiceClient()

    # A bare string is a single info type name; anything else is treated as
    # an iterable of names.
    if isinstance(info_types, str):
        info_types = [info_types]
    info_types = [{'name': name} for name in info_types]

    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'limits': {
            'max_findings_per_request': max_findings
        },
        'include_quote': include_quote,
    }

    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Lookup table from MIME type to the API byte-type index. Only the
    # "unspecified" entry exists, so every file is sent as type 0.
    supported_content_types = {
        None: 0,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    try:
        headers = {'Authorization': 'Bearer ' + os.environ['TOKEN']}
        r = requests.get(filename, headers=headers, stream=True)
        # Without this an HTTP error page (401, 404, ...) would be saved and
        # scanned as if it were the requested file.
        r.raise_for_status()
        payload = r.content

        with open(local_name, 'wb') as f:
            f.write(payload)
    except Exception as e:
        # Existing contract: failures are handed back to the caller as the
        # exception object rather than raised.
        return e

    # Use the bytes already in memory instead of reading the file back.
    item = {'byte_item': {'type': content_type_index, 'data': payload}}

    parent = dlp.project_path(project)
    response = dlp.inspect_content(parent, inspect_config, item)
    return response.result
Beispiel #8
0
def redact_text(data, project):
    """Find PII quotes in `data['transcript']` and record them in `data['dlp']`.

    Args:
        data: A mutable mapping with a 'transcript' string. Quotes of the
            findings are appended to the list under 'dlp', which is created
            when missing.
        project: The Google Cloud project id to use as a parent resource.

    Returns:
        The same `data` object, with the finding quotes appended to
        `data['dlp']`.
    """
    # NOTE(review): this logs the raw input, and the calls below log the
    # transcript and the full API response, all of which may contain the PII
    # being searched for. Confirm that is acceptable for the log sink.
    logging.info(data)
    dlp = google.cloud.dlp_v2.DlpServiceClient()
    parent = dlp.project_path(project)

    # Info types to detect; see
    # https://cloud.google.com/dlp/docs/infotypes-reference
    # ("EMAIL_ADDRESS" was previously listed twice.)
    info_types = [
        "PERSON_NAME", "PHONE_NUMBER", "ORGANIZATION_NAME", "FIRST_NAME",
        "LAST_NAME", "EMAIL_ADDRESS", "DATE_OF_BIRTH",
        "US_SOCIAL_SECURITY_NUMBER", "STREET_ADDRESS"
    ]
    info_types = [{"name": info_type} for info_type in info_types]

    inspect_config = {"info_types": info_types, "include_quote": True}
    logging.info(data['transcript'])
    item = {"value": data['transcript']}
    response = dlp.inspect_content(
        parent,
        inspect_config=inspect_config,
        item=item,
    )
    logging.info(response)

    findings = response.result.findings
    if not findings:
        # This message used to sit on a `for ... else`, so it was printed
        # after every non-empty loop and never for an empty result.
        print("No findings.")
        return data

    quotes = data.setdefault('dlp', [])
    for finding in findings:
        # Findings without a quote (missing or empty) are skipped.
        quote = getattr(finding, 'quote', None)
        if quote:
            print("Quote: {}".format(quote))
            quotes.append(quote)
    return data
Beispiel #9
0
def dlp_inspect(message, custom_dictionaries=None, custom_regexes=None):
    """Inspect a message posted to a Slack channel using the Google DLP API.

    If the text matches any identifier, an alert is posted in the original
    channel and in the incident-response (IR) channel, once per finding.

    Args:
        message: Slack message mapping; the keys "text", "channel" and "user"
            are read.
        custom_dictionaries: Optional list of comma-separated word lists, one
            custom dictionary detector per entry. Defaults to
            ['Test_Keyword'].
        custom_regexes: Optional list of regex patterns, one custom regex
            detector per entry. Defaults to no custom regexes.

    Returns:
        None.

    Raises:
        KeyError: If the `GOOGLE_CLOUD_PROJECT` environment variable is
            missing, or, when there are findings, if `ir_channel` is missing
            from the environment or an expected key is missing from
            `message` or the Slack responses.
    """
    # The Google Cloud project id is read from the environment.
    project = os.environ["GOOGLE_CLOUD_PROJECT"]

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # The text to inspect
    item = {'value': message["text"]}

    # The info types to search for in the content. Required.
    info_types = [{
        'name': 'US_SOCIAL_SECURITY_NUMBER'
    }, {
        'name': 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER'
    }, {
        'name': 'CANADA_SOCIAL_INSURANCE_NUMBER'
    }, {
        'name': 'CREDIT_CARD_NUMBER'
    }]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = ['Test_Keyword']
    dictionaries = [{
        'info_type': {
            'name': 'CUSTOM_DICTIONARY_{}'.format(i)
        },
        'dictionary': {
            'word_list': {
                'words': custom_dict.split(',')
            }
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {
            'name': 'CUSTOM_REGEX_{}'.format(i)
        },
        'regex': {
            'pattern': custom_regex
        }
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # The minimum likelihood to constitute a match. Optional.
    min_likelihood = 'LIKELIHOOD_UNSPECIFIED'

    # The maximum number of findings to report (0 = server maximum). Optional.
    max_findings = 0

    # Whether to include the matching string in the results. Optional.
    include_quote = True

    # Construct the configuration dictionary.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        return

    # The channel and user details and the IR message are the same for every
    # finding, so they are looked up once instead of once per finding.
    try:
        channel = message["channel"]
        # Translate the encoded channel id into its actual name.
        channel_translated = CLIENT.api_call("channels.info", channel=channel)
        user = CLIENT.api_call("users.info", user=message["user"])
    except AttributeError:
        # Same best-effort policy as the per-finding handling below: an
        # AttributeError silently aborts the alerting.
        return

    # Channel that receives incident-response alerts. If no IR channel is
    # used, remove this lookup and the IR post below.
    ir_channel = os.environ["ir_channel"]
    ir_message = "<@{}> might have posted some sensitive data in #{}. You might want to check it out.".format(
        user["user"]["name"],
        channel_translated["channel"]["name"])

    # Send results to Slack channels, one pair of posts per finding.
    for finding in findings:
        try:
            # Notify the channel where the sensitive data was found.
            bot_message = "The following text in your message was found to have sensitive data: `{}`. Type: `{}`.".format(
                finding.quote, finding.info_type.name)
            CLIENT.api_call("chat.postMessage",
                            channel=channel,
                            text=bot_message)
            # Notify the IR channel.
            CLIENT.api_call("chat.postMessage",
                            channel=ir_channel,
                            text=ir_message)
        except AttributeError:
            # A finding lacking the expected attributes is skipped.
            continue
def inspect_file(filename,
                 info_types=None,
                 min_likelihood=None,
                 max_findings=None,
                 include_quote=True,
                 mime_type=None):
    """Analyze a local file for protected data with the Data Loss Prevention API.

    Args:
        filename: The path to the file to inspect.
        info_types: Optional list of info type names to look for. When None,
            None is passed through in the configuration.
        min_likelihood: Optional minimum likelihood threshold for a match.
            One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
            'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Whether the matched text should be included in the
            results.
        mime_type: The MIME type of the file. When None it is guessed from
            the file name, falling back to 'application/octet-stream'.

    Returns:
        None; the findings are printed to the terminal.
    """

    import mimetypes

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    client = google.cloud.dlp.DlpServiceClient()

    # The API takes info types as dictionaries, not bare names.
    type_dicts = None
    if info_types is not None:
        type_dicts = []
        for type_name in info_types:
            type_dicts.append({'name': type_name})

    config = {
        'info_types': type_dicts,
        'min_likelihood': min_likelihood,
        'max_findings': max_findings,
        'include_quote': include_quote,
    }

    # Guess the MIME type from the file name when the caller gave none.
    if mime_type is None:
        guessed_type, _ = mimetypes.MimeTypes().guess_type(filename)
        mime_type = guessed_type or 'application/octet-stream'

    # One item carrying the raw bytes of the file.
    with open(filename, mode='rb') as handle:
        file_item = {'type': mime_type, 'data': handle.read()}

    response = client.inspect_content(config, [file_item])

    if not response.results[0].findings:
        print('No findings.')
        return

    for finding in response.results[0].findings:
        # The quote line is skipped when the finding has no `quote`.
        try:
            quote = finding.quote
        except AttributeError:
            pass
        else:
            print('Quote: {}'.format(quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def omit_name_if_also_email(
    project,
    content_string,
):
    """Match PERSON_NAME and EMAIL_ADDRESS, but not both on the same text.

    Uses the Data Loss Prevention API to omit matches on PERSON_NAME when the
    EMAIL_ADDRESS detector also matches.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        A list with the info type name of each finding, in response order.
    """

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    person_name = {"name": "PERSON_NAME"}
    email_address = {"name": "EMAIL_ADDRESS"}

    # Exclusion rule: drop a finding when it partially matches an
    # EMAIL_ADDRESS finding.
    exclude_emails = {
        "exclusion_rule": {
            "exclude_info_types": {
                "info_types": [{"name": "EMAIL_ADDRESS"}]
            },
            "matching_type": "MATCHING_TYPE_PARTIAL_MATCH"
        }
    }

    # Look for both info types, but apply the exclusion rule to PERSON_NAME
    # only. This reduces the number of findings when the two overlap.
    config = {
        "info_types": [person_name, email_address],
        "rule_set": [{
            "info_types": [{"name": "PERSON_NAME"}],
            "rules": [exclude_emails],
        }],
    }

    content_item = {"value": content_string}
    parent_path = client.project_path(project)

    response = client.inspect_content(parent_path, config, content_item)

    matched_types = []
    for finding in response.result.findings:
        matched_types.append(finding.info_type.name)
    return matched_types
Beispiel #12
0
def inspect_string_custom_omit_overlap(project, content_string):
    """Match PERSON_NAME and a custom detector, preferring the custom one.

    Uses the Data Loss Prevention API to omit matches on the built-in
    PERSON_NAME detector when they overlap with matches from the custom
    VIP_DETECTOR.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the findings are printed to the terminal.
    """

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Custom regex detector for specific names, flagged as an
    # exclusion-type detector.
    vip_detector = {
        "info_type": {"name": "VIP_DETECTOR"},
        "regex": {"pattern": "Larry Page|Sergey Brin"},
        "exclusion_type":
        google.cloud.dlp_v2.CustomInfoType.ExclusionType.
        EXCLUSION_TYPE_EXCLUDE,
    }

    # Exclusion rule: drop PERSON_NAME findings that fully match a
    # VIP_DETECTOR finding.
    exclude_vips = {
        "exclusion_rule": {
            "exclude_info_types": {
                "info_types": [{"name": "VIP_DETECTOR"}]
            },
            "matching_type":
            google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH,
        }
    }
    person_rules = {
        "info_types": [{"name": "PERSON_NAME"}],
        "rules": [exclude_vips],
    }

    config = {
        "info_types": [{"name": "PERSON_NAME"}],
        "custom_info_types": [vip_detector],
        "rule_set": [person_rules],
        "include_quote": True,
    }

    request = {
        "parent": f"projects/{project}",
        "inspect_config": config,
        "item": {"value": content_string},
    }
    response = client.inspect_content(request=request)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
Beispiel #13
0
def inspect_file(project, filename, info_types, min_likelihood=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Inspect a file with the DLP API and return the findings' bounding boxes.

    Args:
        project: The GCP project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of info type names to look for. When empty or
            None, FIRST_NAME, LAST_NAME and EMAIL_ADDRESS are used.
        min_likelihood: Optional minimum likelihood threshold for a match.
            One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
            'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Whether the matched text should be included in the
            results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        A list of the image bounding boxes taken from the first content
        location of every finding, or None when the API reports no findings.
        Each finding is also printed to the terminal.
    """

    import mimetypes

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Fall back to a default set of info types, then convert the names into
    # the dictionaries the API takes.
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # `include_quote` is now forwarded; it used to be accepted but never
    # sent to the API.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    # Unlisted MIME types fall back to index 1 (the 'image/jpeg' entry).
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 1)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        print('No findings.')
        return None

    # Collect boxes across all findings. The old `return` inside the loop
    # exited after the first finding and silently dropped the rest.
    boxes = []
    for finding in findings:
        try:
            print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))

        # A finding without any content location contributes no boxes
        # instead of raising IndexError.
        content_locations = finding.location.content_locations
        if content_locations:
            boxes.extend(content_locations[0].image_location.bounding_boxes)

    return boxes
Beispiel #14
0
def inspect_string(project,
                   content_string,
                   info_types,
                   min_likelihood=None,
                   max_findings=None,
                   include_quote=True):
    """Inspect a string for protected data using the Data Loss Prevention API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of info type names to look for.
        min_likelihood: Optional minimum likelihood threshold for a match.
            One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
            'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Whether the matched text should be included in the
            results.

    Returns:
        None; the findings are printed to the terminal.
    """

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    def _report(finding):
        # Print one finding. The quote line appears only when a non-empty
        # quote is present.
        try:
            if finding.quote:
                print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))

    service = google.cloud.dlp.DlpServiceClient()

    # The API takes info types as dictionaries, not bare names.
    requested = [dict(name=type_name) for type_name in info_types]

    limits = {'max_findings_per_request': max_findings}
    config = {
        'info_types': requested,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': limits,
    }

    payload = {'value': content_string}
    resource = service.project_path(project)

    response = service.inspect_content(resource, config, payload)

    if response.result.findings:
        for found in response.result.findings:
            _report(found)
    else:
        print('No findings.')
Beispiel #15
0
def inspect_with_medical_record_number_w_custom_hotwords(
    project,
    content_string,
):
    """Inspect a string with a medical record number detector plus hotwords.

    Uses the Data Loss Prevention API with a custom regex detector for
    medical record numbers and a hotword rule that raises the likelihood of
    a finding when a hotword appears shortly before it.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the findings are printed to the terminal.
    """

    # The client library is imported lazily, inside the function.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Custom info type "C_MRN": three digits, one digit, five digits,
    # separated by dashes, each digit in the range 1-9. Matches are reported
    # with likelihood POSSIBLE.
    mrn_detector = {
        "info_type": {"name": "C_MRN"},
        "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
        "likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE,
    }

    # Hotword rule: when "mrn" or "medical" (case-insensitive) occurs within
    # the 10 characters preceding a finding, fix its likelihood to
    # VERY_LIKELY.
    boost_near_hotword = {
        "hotword_regex": {"pattern": "(?i)(mrn|medical)(?-i)"},
        "likelihood_adjustment": {
            "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY
        },
        "proximity": {"window_before": 10},
    }

    # The hotword rule applies to C_MRN findings only.
    mrn_rules = {
        "info_types": [{"name": "C_MRN"}],
        "rules": [{"hotword_rule": boost_near_hotword}],
    }

    config = {
        "custom_info_types": [mrn_detector],
        "rule_set": [mrn_rules],
        "include_quote": True,
    }

    request = {
        "parent": f"projects/{project}",
        "inspect_config": config,
        "item": {"value": content_string},
    }
    response = client.inspect_content(request=request)

    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
Beispiel #16
0
def inspect_string_multiple_rules(project, content_string):
    """Inspect a string for PERSON_NAME with combined hotword/exclusion rules.

    One rule set attached to PERSON_NAME carries two hotword rules ("patient"
    fixes the likelihood to VERY_LIKELY, "doctor" fixes it to UNLIKELY, both
    with a `window_before` of 10) and two exclusion rules (the dictionary
    word "quasimodo" and the regex "REDACTED", both as partial matches).

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the quote, info type and likelihood of every finding are
        printed to the terminal, or "No findings." if there are none.
    """

    # Import the client library.
    import google.cloud.dlp

    dlp_v2 = google.cloud.dlp_v2

    # Instantiate a client.
    client = dlp_v2.DlpServiceClient()

    def build_hotword_rule(hotword, likelihood):
        # The two hotword rules only differ in the hotword itself and in the
        # likelihood a nearby finding is pinned to.
        return {
            "hotword_regex": {"pattern": hotword},
            "proximity": {"window_before": 10},
            "likelihood_adjustment": {"fixed_likelihood": likelihood},
        }

    partial_match = dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH

    # Hotword rules first, then exclusion rules, all applied to PERSON_NAME.
    combined_rules = [
        {
            "hotword_rule": build_hotword_rule(
                "patient", dlp_v2.Likelihood.VERY_LIKELY)
        },
        {
            "hotword_rule": build_hotword_rule(
                "doctor", dlp_v2.Likelihood.UNLIKELY)
        },
        {
            "exclusion_rule": {
                "dictionary": {"word_list": {"words": ["quasimodo"]}},
                "matching_type": partial_match,
            }
        },
        {
            "exclusion_rule": {
                "regex": {"pattern": "REDACTED"},
                "matching_type": partial_match,
            }
        },
    ]

    # Assemble the full request: parent resource id, inspection
    # configuration and the item holding the text to inspect.
    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "info_types": [{"name": "PERSON_NAME"}],
            "rule_set": [{
                "info_types": [{"name": "PERSON_NAME"}],
                "rules": combined_rules,
            }],
            "include_quote": True,
        },
        "item": {"value": content_string},
    }

    # Call the API.
    response = client.inspect_content(request=request)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
Beispiel #17
0
def inspect_with_person_name_w_custom_hotword(project,
                                              content_string,
                                              custom_hotword="patient"):
    """Boost PERSON_NAME findings that follow a caller-supplied hotword.

    Sends `content_string` to the Data Loss Prevention API with a hotword
    rule on PERSON_NAME that fixes the likelihood to VERY_LIKELY when the
    hotword appears in the 50-character window before the finding. The
    minimum likelihood is also set to VERY_LIKELY, so only findings at that
    level are reported.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        custom_hotword: The custom hotword used for likelihood boosting; it
            is sent as the hotword regex pattern.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    dlp_v2 = google.cloud.dlp_v2

    # Instantiate a client.
    client = dlp_v2.DlpServiceClient()

    very_likely = dlp_v2.Likelihood.VERY_LIKELY

    # Rule set: the caller's hotword pins a following PERSON_NAME finding to
    # VERY_LIKELY.
    person_name_rules = [{
        "info_types": [{"name": "PERSON_NAME"}],
        "rules": [{
            "hotword_rule": {
                "hotword_regex": {"pattern": custom_hotword},
                "likelihood_adjustment": {"fixed_likelihood": very_likely},
                "proximity": {"window_before": 50},
            }
        }],
    }]

    # Build the request from the parent resource id, the inspection
    # configuration (rule set plus likelihood threshold) and the item.
    request = {
        "parent": f"projects/{project}",
        "inspect_config": {
            "rule_set": person_name_rules,
            "min_likelihood": very_likely,
            "include_quote": True,
        },
        "item": {"value": content_string},
    }

    # Call the API.
    response = client.inspect_content(request=request)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
Beispiel #18
0
def inspect_string_without_overlap(project, content_string):
    """Find EMAIL_ADDRESS and DOMAIN_NAME, omitting domains inside emails.

    Uses the Data Loss Prevention API with an exclusion rule so that a
    DOMAIN_NAME match overlapping an EMAIL_ADDRESS match is left out.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    dlp_v2 = google.cloud.dlp_v2

    # Instantiate a client.
    client = dlp_v2.DlpServiceClient()

    exclude_type = dlp_v2.CustomInfoType.ExclusionType.EXCLUSION_TYPE_EXCLUDE
    partial_match = dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH

    # Exclusion rule for DOMAIN_NAME: drop matches that (partially) overlap
    # an EMAIL_ADDRESS match.
    domain_rules = {
        "info_types": [{"name": "DOMAIN_NAME"}],
        "rules": [{
            "exclusion_rule": {
                "exclude_info_types": {
                    "info_types": [{"name": "EMAIL_ADDRESS"}]
                },
                "matching_type": partial_match,
            }
        }],
    }

    inspect_config = {
        # infoTypes for DLP to locate in `content_string`. See
        # https://cloud.google.com/dlp/docs/concepts-infotypes for more
        # information about supported infoTypes.
        "info_types": [{"name": "DOMAIN_NAME"}, {"name": "EMAIL_ADDRESS"}],
        # EMAIL_ADDRESS is additionally declared as a custom info type with
        # the EXCLUDE exclusion type.
        "custom_info_types": [{
            "info_type": {"name": "EMAIL_ADDRESS"},
            "exclusion_type": exclude_type,
        }],
        "rule_set": [domain_rules],
        "include_quote": True,
    }

    # Call the API with the full resource id of the project and the item.
    response = client.inspect_content(request={
        "parent": f"projects/{project}",
        "inspect_config": inspect_config,
        "item": {"value": content_string},
    })

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_file(project, filename, info_types, min_likelihood=None,
                 custom_dictionaries=None, custom_regexes=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Uses the Data Loss Prevention API to analyze a file for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
            When empty or None, FIRST_NAME, LAST_NAME and EMAIL_ADDRESS are
            used.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        custom_dictionaries: Optional list of strings, each a comma-separated
            word list; entry i becomes the detector CUSTOM_DICTIONARY_i.
        custom_regexes: Optional list of regex patterns; entry i becomes the
            detector CUSTOM_REGEX_i.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        # Forward the caller's choice; without this key the parameter would
        # have no effect on the request.
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # mimetypes reports SVG files as 'image/svg+xml'.
        'image/svg+xml': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                # Skip the quote line when no quote came back, e.g. because
                # include_quote is False.
                if finding.quote:
                    print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
Beispiel #20
0
def inspect_string_with_exclusion_dict(project,
                                       content_string,
                                       exclusion_list=["*****@*****.**"]):
    """Inspect text for EMAIL_ADDRESS, skipping addresses on an exclusion list.

    Uses the Data Loss Prevention API with a dictionary exclusion rule, so an
    EMAIL_ADDRESS match is omitted when it fully matches an entry of
    `exclusion_list`.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        exclusion_list: The list of strings to ignore matches on

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    dlp_v2 = google.cloud.dlp_v2

    # Instantiate a client.
    client = dlp_v2.DlpServiceClient()

    # infoTypes for DLP to locate in `content_string`. See
    # https://cloud.google.com/dlp/docs/concepts-infotypes for more
    # information about supported infoTypes. The same list is used for the
    # rule set and for the inspection configuration.
    email_info_types = [{"name": "EMAIL_ADDRESS"}]

    # Only report EMAIL_ADDRESS when the matched text is not an exact entry
    # of the exclusion list.
    exclusion_rule = {
        "dictionary": {"word_list": {"words": exclusion_list}},
        "matching_type": dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH,
    }

    inspect_config = {
        "info_types": email_info_types,
        "rule_set": [{
            "info_types": email_info_types,
            "rules": [{"exclusion_rule": exclusion_rule}],
        }],
        "include_quote": True,
    }

    # Call the API with the full resource id of the project and the item.
    response = client.inspect_content(request={
        "parent": f"projects/{project}",
        "inspect_config": inspect_config,
        "item": {"value": content_string},
    })

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
Beispiel #21
0
def inspect_string(
    project,
    content_string,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    include_quote=True,
):
    """Analyze a string for protected data with the Data Loss Prevention API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: Optional list of strings, each a comma-separated
            word list; entry i becomes the detector CUSTOM_DICTIONARY_i.
        custom_regexes: Optional list of regex patterns; entry i becomes the
            detector CUSTOM_REGEX_i.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    client = google.cloud.dlp_v2.DlpServiceClient()

    # Turn the list of info type names into the list of dictionaries the API
    # expects (protos are also accepted).
    wanted_info_types = []
    for type_name in info_types:
        wanted_info_types.append({"name": type_name})

    # Build the custom detectors: dictionary word lists first, then regex
    # patterns, each numbered by its position in the caller's list.
    word_lists = [] if custom_dictionaries is None else custom_dictionaries
    patterns = [] if custom_regexes is None else custom_regexes

    custom_detectors = []
    for index, words in enumerate(word_lists):
        custom_detectors.append({
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(index)},
            "dictionary": {"word_list": {"words": words.split(",")}},
        })
    for index, pattern in enumerate(patterns):
        custom_detectors.append({
            "info_type": {"name": "CUSTOM_REGEX_{}".format(index)},
            "regex": {"pattern": pattern},
        })

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    config = {
        "info_types": wanted_info_types,
        "custom_info_types": custom_detectors,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # The item to inspect and the full resource id of the project.
    content_item = {"value": content_string}
    parent = client.project_path(project)

    # Call the API.
    response = client.inspect_content(parent, config, content_item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        # The quote is only printed when it is present and non-empty.
        try:
            quote = finding.quote
        except AttributeError:
            quote = None
        if quote:
            print("Quote: {}".format(quote))
        print("Info type: {}".format(finding.info_type.name))
        print("Likelihood: {}".format(finding.likelihood))
Beispiel #22
0
def inspect_file(project, filename, info_types,
                 min_likelihood=None,
                 custom_dictionaries=None, custom_regexes=None,
                 max_findings=None, include_quote=True, mime_type=None):
    """Uses the Data Loss Prevention API to analyze a file for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
            When empty or None, the single info type 'ALL_BASIC' is used.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        custom_dictionaries: Optional list of strings, each a comma-separated
            word list; entry i becomes the detector CUSTOM_DICTIONARY_i.
        custom_regexes: Optional list of regex patterns; entry i becomes the
            detector CUSTOM_REGEX_i.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = [{'name': 'ALL_BASIC'}]
    else:
        info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        # Forward the caller's choice; without this key the parameter would
        # have no effect on the request.
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # mimetypes reports SVG files as 'image/svg+xml'.
        'image/svg+xml': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                # Skip the quote line when no quote came back, e.g. because
                # include_quote is False.
                if finding.quote:
                    print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
Beispiel #23
0
def inspect_string_custom_excluding_substring(project,
                                              content_string,
                                              exclusion_list=["jimmy"]):
    """Inspect text with a custom name detector, skipping excluded tokens.

    Uses the Data Loss Prevention API with a custom regex detector and a
    dictionary exclusion rule, so a match is omitted when it partially
    matches an entry of `exclusion_list`.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        exclusion_list: The list of strings to ignore matches on

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    dlp_v2 = google.cloud.dlp_v2

    # Instantiate a client.
    client = dlp_v2.DlpServiceClient()

    # Custom regex detector for names written as "Word, Word".
    name_detector = {
        "info_type": {"name": "CUSTOM_NAME_DETECTOR"},
        "regex": {"pattern": "[A-Z][a-z]{1,15}, [A-Z][a-z]{1,15}"},
    }

    # Only report a match of the custom detector if its text does not contain
    # a token from the exclusion list.
    exclusion_rule = {
        "dictionary": {"word_list": {"words": exclusion_list}},
        "matching_type": dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
    }

    inspect_config = {
        "custom_info_types": [name_detector],
        "rule_set": [{
            "info_types": [{"name": "CUSTOM_NAME_DETECTOR"}],
            "rules": [{"exclusion_rule": exclusion_rule}],
        }],
        "include_quote": True,
    }

    # Call the API with the full resource id of the project and the item.
    response = client.inspect_content(request={
        "parent": f"projects/{project}",
        "inspect_config": inspect_config,
        "item": {"value": content_string},
    })

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return
    for finding in findings:
        print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Analyze tabular data for protected data with the DLP API.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: Table data as a mapping with a "header" list of column names
            and a "rows" list whose entries are lists of cell values.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: Optional list of strings, each a comma-separated
            word list; entry i becomes the detector CUSTOM_DICTIONARY_i.
        custom_regexes: Optional list of regex patterns; entry i becomes the
            detector CUSTOM_REGEX_i.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.

    Example:
        data = {
            "header": ["email", "phone number"],
            "rows": [
                ["first.user@example.com", "4232342345"],
                ["second.user@example.com", "4253458383"],
            ],
        }

        For every finding the function prints a "Quote: ..." line (when a
        quote is present), an "Info type: ..." line and a "Likelihood: ..."
        line.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    client = google.cloud.dlp.DlpServiceClient()

    # Turn the list of info type names into the list of dictionaries the API
    # expects (protos are also accepted).
    wanted_info_types = []
    for type_name in info_types:
        wanted_info_types.append({'name': type_name})

    # Build the custom detectors: dictionary word lists first, then regex
    # patterns, each numbered by its position in the caller's list.
    word_lists = [] if custom_dictionaries is None else custom_dictionaries
    patterns = [] if custom_regexes is None else custom_regexes

    custom_detectors = []
    for index, words in enumerate(word_lists):
        custom_detectors.append({
            'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(index)},
            'dictionary': {'word_list': {'words': words.split(',')}},
        })
    for index, pattern in enumerate(patterns):
        custom_detectors.append({
            'info_type': {'name': 'CUSTOM_REGEX_{}'.format(index)},
            'regex': {'pattern': pattern},
        })

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    config = {
        'info_types': wanted_info_types,
        'custom_info_types': custom_detectors,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    column_headers = [{"name": column} for column in data["header"]]
    table_rows = [
        {"values": [{"string_value": cell} for cell in record]}
        for record in data["rows"]
    ]
    content_item = {"table": {"headers": column_headers, "rows": table_rows}}

    # Convert the project id into a full resource id.
    parent = client.project_path(project)

    # Call the API.
    response = client.inspect_content(parent, config, content_item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return
    for finding in findings:
        # The quote is only printed when it is present and non-empty.
        try:
            quote = finding.quote
        except AttributeError:
            quote = None
        if quote:
            print('Quote: {}'.format(quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
def inspect_file(project,
                 filename,
                 info_types,
                 min_likelihood=None,
                 custom_dictionaries=None,
                 custom_regexes=None,
                 max_findings=None,
                 include_quote=True,
                 mime_type=None):
    """Inspect a file with the DLP API and classify it by finding count.

    The file is reported as containing sensitive data when the API returns
    at least 10 findings. A per-type summary for FIRST_NAME, LAST_NAME,
    LOCATION and US_STATE is printed to the terminal.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: The info types to look for. The value is passed to the
            API unchanged, so presumably it must already be a list of
            {'name': ...} dictionaries; verify against the caller.
        min_likelihood: The minimum likelihood to constitute a match.
            Defaults to 'LIKELY' when None.
        custom_dictionaries: Accepted for signature compatibility; not used.
        custom_regexes: Accepted for signature compatibility; not used.
        max_findings: The maximum number of findings to report (0 = server
            maximum). Defaults to 0 when None.
        include_quote: Boolean for whether to include a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.

    Returns:
        'sensitive_data_found_' + filename when there are 10 or more
        findings, otherwise 'no_sensitve_data_found_' + filename.
    """

    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Honour the caller's values and fall back to the previous hard-coded
    # settings only when nothing was passed.
    if min_likelihood is None:
        min_likelihood = 'LIKELY'
    if max_findings is None:
        max_findings = 0

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {
            'max_findings_per_request': max_findings
        },
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # mimetypes reports SVG files as 'image/svg+xml'.
        'image/svg+xml': 4,
        'text/plain': 5
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    findings = response.result.findings
    if not findings:
        print('No findings.')
        return 'no_sensitve_data_found_' + filename

    total_findings = len(findings)
    print('the total number of findings is: ', total_findings)

    # Count the findings of the info types covered by the summary.
    counts = dict.fromkeys(
        ('FIRST_NAME', 'LAST_NAME', 'LOCATION', 'US_STATE'), 0)
    for finding in findings:
        type_name = finding.info_type.name
        if type_name in counts:
            counts[type_name] += 1

    # Print the summary before returning; placed after the return statements
    # it would never run.
    print('TOTAL_FINDINGS = ', total_findings)
    print('FIRST_NAMES = ', counts['FIRST_NAME'])
    print('LAST_NAMES = ', counts['LAST_NAME'])
    print('LOCATIONS = ', counts['LOCATION'])
    print('US_STATES = ', counts['US_STATE'])

    # Ten or more findings mark the file as containing sensitive data.
    if total_findings >= 10:
        return 'sensitive_data_found_' + filename
    return 'no_sensitve_data_found_' + filename
Beispiel #26
0
def inspect_table(
    project,
    data,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    include_quote=True,
):
    """Uses the Data Loss Prevention API to analyze table data for protected
    data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: The table to inspect. Either a mapping with a "header" list and
            a "rows" list of lists, or a JSON string (str/bytes) that decodes
            to such a mapping.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: Optional list of strings, each a comma-separated
            word list. Entry i becomes a custom dictionary detector named
            CUSTOM_DICTIONARY_<i>.
        custom_regexes: Optional list of regex pattern strings. Entry i
            becomes a custom regex detector named CUSTOM_REGEX_<i>.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the findings from the API response are printed to the terminal.
    Raises:
        ValueError: If `data` is a string that is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
        KeyError: If the table has no "header" or "rows" entry.
    Example:
        data = {
            "header": ["email", "phone number"],
            "rows": [
                ["user1@example.com", "4232342345"],
                ["user2@example.com", "4253458383"]
            ]
        }

        For this table the function prints, per finding, lines of the form:
            Quote: user1@example.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
    """

    import json

    # Import the client library.
    import google.cloud.dlp

    # The table may arrive still JSON-encoded (e.g. straight from a command
    # line argument); decode it so the header/rows lookups below work.
    if isinstance(data, (str, bytes, bytearray)):
        data = json.loads(data)

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    headers = [{"name": val} for val in data["header"]]
    rows = [
        {"values": [{"string_value": cell_val} for cell_val in row]}
        for row in data["rows"]
    ]
    item = {"table": {"headers": headers, "rows": rows}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            # Best effort: the quote is only shown when the finding carries
            # one (it is empty when include_quote is False).
            try:
                if finding.quote:
                    print("Quote: {}".format(finding.quote))
            except AttributeError:
                pass
            print("Info type: {}".format(finding.info_type.name))
            print("Likelihood: {}".format(finding.likelihood))
    else:
        print("No findings.")
Beispiel #27
0
def quickstart():
    """Run one hard-coded Data Loss Prevention inspection and print findings.

    Inspects the literal string 'Robert Frost' for FIRST_NAME and LAST_NAME
    info types and prints each finding's quote, info type and likelihood
    name, or 'No findings.' when the response contains none.
    Returns:
        None; results are printed to the terminal.
    """

    # [START dlp_quickstart]
    # Import the client library
    import google.cloud.dlp

    # Edit this with your Google Cloud Project ID.
    project = 'your-project'

    # Instantiate a client.
    client = google.cloud.dlp.DlpServiceClient()

    # The item to inspect: a plain string value.
    payload = {'value': 'Robert Frost'}

    # Inspection settings. Keys which are None may optionally be omitted
    # entirely.
    settings = {
        # The info types to search for in the content. Required.
        'info_types': [{'name': 'FIRST_NAME'}, {'name': 'LAST_NAME'}],
        # The minimum likelihood to constitute a match. Optional.
        'min_likelihood': 'LIKELIHOOD_UNSPECIFIED',
        # Whether to include the matching string in the results. Optional.
        'include_quote': True,
        # The maximum number of findings to report (0 = server maximum).
        # Optional.
        'limits': {'max_findings_per_request': 0},
    }

    # Convert the project id into a full resource id, then call the API.
    resource_name = client.project_path(project)
    reply = client.inspect_content(resource_name, settings, payload)

    # Print out the results.
    matches = reply.result.findings
    if not matches:
        print('No findings.')
        return

    for match in matches:
        try:
            print('Quote: {}'.format(match.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(match.info_type.name))
        # Look up the enum name for the numeric likelihood through the
        # Finding message descriptor.
        likelihood_field = (
            google.cloud.dlp.types.Finding.DESCRIPTOR
            .fields_by_name['likelihood']
        )
        enum_entry = likelihood_field.enum_type.values_by_number[
            match.likelihood
        ]
        print('Likelihood: {}'.format(enum_entry.name))
Beispiel #28
0
def inspect_file(
    project,
    filename,
    info_types,
    min_likelihood=None,
    custom_dictionaries=None,
    custom_regexes=None,
    max_findings=None,
    include_quote=True,
    mime_type=None,
):
    """Uses the Data Loss Prevention API to analyze a file for protected data.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
            When empty or None, FIRST_NAME, LAST_NAME and EMAIL_ADDRESS are
            used.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        custom_dictionaries: Optional list of strings, each a comma-separated
            word list. Entry i becomes a custom dictionary detector named
            CUSTOM_DICTIONARY_<i>.
        custom_regexes: Optional list of regex pattern strings. Entry i
            becomes a custom regex detector named CUSTOM_REGEX_<i>.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
            Types missing from the supported table are sent as type 0
            ("Unspecified").
    Returns:
        None; the findings from the API response are printed to the terminal.
    Raises:
        OSError: If `filename` cannot be opened for reading.
    """

    import mimetypes

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if not info_types:
        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        # Forward the caller's choice; without this key the include_quote
        # argument would have no effect on the request.
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0]

    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        # The mimetypes module reports .svg files as "image/svg+xml", so the
        # guessed type needs its own entry to reach the SVG index.
        "image/svg+xml": 4,
        "text/plain": 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the item, containing the file's byte data.
    with open(filename, mode="rb") as f:
        item = {"byte_item": {"type": content_type_index, "data": f.read()}}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            # Best effort: only show a quote line when the finding carries
            # one (it is empty when include_quote is False).
            try:
                if finding.quote:
                    print("Quote: {}".format(finding.quote))
            except AttributeError:
                pass
            print("Info type: {}".format(finding.info_type.name))
            print("Likelihood: {}".format(finding.likelihood))
    else:
        print("No findings.")
Beispiel #29
0
def main(custom_dictionaries=None, custom_regexes=None, mime_type=None):
    """Inspect the file named by `parse_arguments().file` and print findings.

    Looks for FIRST_NAME, LAST_NAME and CREDIT_CARD_NUMBER info types plus
    any custom detectors supplied by the caller.
    Args:
        custom_dictionaries: Optional list of strings, each a comma-separated
            word list. Entry i becomes a custom dictionary detector named
            CUSTOM_DICTIONARY_<i>.
        custom_regexes: Optional list of regex pattern strings. Entry i
            becomes a custom regex detector named CUSTOM_REGEX_<i>.
        mime_type: The MIME type of the file. If not specified, the type is
            guessed from the file name with the mimetypes module. Types
            missing from the supported table are sent as type 0
            ("Unspecified").
    Returns:
        None; the findings from the API response are printed to the terminal.
    """
    # NOTE(review): `parse_arguments`, `mimetypes`, `dlp` and `project` are
    # not defined inside this function; presumably they are module-level
    # names elsewhere in the file -- confirm before relying on this entry
    # point.
    args = parse_arguments()

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(args.file)
        mime_type = mime_guess[0]
    # Select the content type index from the list of supported types.
    supported_content_types = {
        None: 0,  # "Unspecified"
        'image/jpeg': 1,
        'image/bmp': 2,
        'image/png': 3,
        'image/svg': 4,
        # The mimetypes module reports .svg files as 'image/svg+xml', so the
        # guessed type needs its own entry to reach the SVG index.
        'image/svg+xml': 4,
        'text/plain': 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # The file to inspect
    with open(args.file, mode='rb') as f:
        item = {'byte_item': {'type': content_type_index, 'data': f.read()}}

    # The info types to search for in the content. Required.
    info_types = [
        {'name': 'FIRST_NAME'},
        {'name': 'LAST_NAME'},
        {'name': 'CREDIT_CARD_NUMBER'},
    ]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # The minimum likelihood to constitute a match. Optional.
    min_likelihood = 'LIKELIHOOD_UNSPECIFIED'

    # The maximum number of findings to report (0 = server maximum). Optional.
    max_findings = 0

    # Whether to include the matching string in the results. Optional.
    include_quote = True

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            # The likelihood is printed exactly as it appears on the finding;
            # no conversion to the enum name is performed here.
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')