Ejemplo n.º 1
0
def sendToDLP(transcript, projectID):
    """De-identify a transcript with the Cloud DLP API and print the result.

    Spoken e-mail addresses of the form "name at example.com" are first
    rewritten to "name@example.com". The rewritten text is then passed to the
    DLP ``deidentify_content`` call with a ``replace_with_info_type_config``
    transformation, and the text returned by the service is printed.

    Args:
        transcript: Text to de-identify.
        projectID: Project identifier used to build the DLP parent path.

    Returns:
        None. The de-identified text is printed to stdout.
    """
    dlpClient = dlp.DlpServiceClient()
    parent = dlpClient.project_path(projectID)

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [
        'PHONE_NUMBER', 'EMAIL_ADDRESS', 'CREDIT_CARD_NUMBER',
        'US_SOCIAL_SECURITY_NUMBER'
    ]
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary: a single primitive
    # transformation (replace_with_info_type_config) with no info_types
    # filter of its own.
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [{
                'primitive_transformation': {
                    'replace_with_info_type_config': {}
                }
            }]
        }
    }

    # Rewrite "<local> at <domain>" as "<local>@<domain>" before inspection.
    # The pattern deliberately has no leading wildcard: a leading "." would
    # swallow the character in front of the address (and the first letter of
    # an address at the very start of the transcript).
    # Note: the domain part only matches lower-case letters, digits and "-".
    local_part = (r"([A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+"
                  r"(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*)")
    spoken_at = r"(\sat\s+)"
    domain = (r"((?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+"
              r"[a-z0-9](?:[a-z0-9-]*[a-z0-9]))")
    regex = local_part + spoken_at + domain

    updatedTranscript = re.sub(regex, r"\1@\3", transcript)

    item = {'value': updatedTranscript}

    # Call the API
    dlpResponse = dlpClient.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item)

    # Print out the results.
    print(dlpResponse.item.value)
Ejemplo n.º 2
0
"""The maximum number of findings to report (0 = server maximum)"""
MAX_FINDINGS = 0

# The infoTypes of information to match.
# For more info visit: https://cloud.google.com/dlp/docs/concepts-infotypes
INFO_TYPES = [
    'FIRST_NAME',
    'PHONE_NUMBER',
    'EMAIL_ADDRESS',
    'US_SOCIAL_SECURITY_NUMBER',
]

# Bracketed placeholder values -- presumably to be replaced with the real
# project, dataset and table that receive the DLP findings (TODO confirm).
PROJECT_ID = '[PROJECT_ID_FOR_DLP_FINDINGS]'
DATASET_ID = '[DATASET_ID_FOR_DLP_FINDINGS]'
TABLE_ID = '[TABLE_ID_FOR_DLP_FINDINGS]'

# End of User-configurable Constants
# ----------------------------------

# Initialize the Google Cloud client libraries.
# NOTE(review): the first assignment rebinds the name `dlp` to the client
# instance it creates, so from here on `dlp` no longer refers to whatever
# provided `DlpServiceClient` (presumably the imported DLP module).
dlp = dlp.DlpServiceClient()
storage_client = storage.Client()
publisher = pubsub.PublisherClient()
subscriber = pubsub.SubscriberClient()


def create_DLP_job(data, done):
    """Intended to create a DLP job for a newly uploaded file.

    Per the original description, this function is triggered by new files
    uploaded to the designated Cloud Storage quarantine/staging bucket and
    creates a DLP job for the uploaded file.

    NOTE(review): no implementation is visible in this excerpt; the body
    consists of this docstring only.

    Args:
        data: The Cloud Storage event.
        done: Not described in the original docstring and not used in the
            visible code -- TODO confirm its purpose.

    Returns:
        None. Debug information is printed to the log (per the original
        description; no logging call is visible here).
    """
    # Get the targeted file in the quarantine bucket
def deidentify(file_name, projectID):
    """Transcribe an audio file and print a de-identified transcript.

    The file is read in binary mode and sent to Speech-to-Text configured
    for FLAC audio at 16000 Hz in en-US. The first alternative of every
    result is concatenated into one transcript. Spoken e-mail addresses
    ("name at example.com") are rewritten to "name@example.com", and the
    text is then passed to the DLP ``deidentify_content`` call with a
    ``replace_with_info_type_config`` transformation.

    The original transcript, the reformatted transcript and the text
    returned by DLP are each printed to stdout.

    Args:
        file_name: Path of the audio file to transcribe.
        projectID: Project identifier used to build the DLP parent path.

    Returns:
        None.
    """
    # Instantiate the clients
    speechClient = speech.SpeechClient()
    dlpClient = dlp.DlpServiceClient()

    parent = dlpClient.project_path(projectID)

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [
        'PHONE_NUMBER', 'EMAIL_ADDRESS', 'CREDIT_CARD_NUMBER',
        'US_SOCIAL_SECURITY_NUMBER'
    ]
    inspect_config = {
        'info_types': [{'name': info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary: a single primitive
    # transformation (replace_with_info_type_config) with no info_types
    # filter of its own.
    deidentify_config = {
        'info_type_transformations': {
            'transformations': [{
                'primitive_transformation': {
                    'replace_with_info_type_config': {}
                }
            }]
        }
    }

    # Load the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detect speech in the audio file
    response = speechClient.recognize(config, audio)

    # Concatenate the top alternative of each result, with no separator.
    transcript = "".join(
        result.alternatives[0].transcript for result in response.results)

    print('Original Transcript: {}'.format(transcript))

    # Speech-to-text returns " at " instead of "@", so e-mail addresses are
    # reformatted with a regex before the text is sent to the DLP API.
    # Known limitation noted by the original author: social security numbers
    # and credit card numbers are currently interpreted as phone numbers.
    #
    # The pattern deliberately has no leading wildcard: a leading "." would
    # swallow the character in front of the address (and the first letter of
    # an address at the very start of the transcript).
    # Note: the domain part only matches lower-case letters, digits and "-".
    local_part = (r"([A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+"
                  r"(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*)")
    spoken_at = r"(\sat\s+)"
    domain = (r"((?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+"
              r"[a-z0-9](?:[a-z0-9-]*[a-z0-9]))")
    regex = local_part + spoken_at + domain

    updatedTranscript = re.sub(regex, r"\1@\3", transcript)

    print('Email addresses reformatted: {}'.format(updatedTranscript))

    # Construct item
    item = {'value': updatedTranscript}

    # Call the API
    dlpResponse = dlpClient.deidentify_content(
        parent, inspect_config=inspect_config,
        deidentify_config=deidentify_config, item=item)

    # Print out the results.
    print('Final Result with sensitive content redacted: {}'.format(dlpResponse.item.value))