def lambda_handler(event, context): print("We got this event:\n", event) operator_object = MediaInsightsOperationHelper(event) try: workflow_id = operator_object.workflow_execution_id except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(comprehend_error="Missing a required metadata key {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: bucket = operator_object.input["Media"]["Text"]["S3Bucket"] key = operator_object.input["Media"]["Text"]["S3Key"] # If operator_object.input["Media"]["Text"]["S3Key"] is a json file, # then we're working with metadata about the text file and need to # get the actual transcript text from the TextTranscriptUri field. # Otherwise we assume operator_object.input["Media"]["Text"]["S3Key"] # contains only the transcript text. file_ext = str(key.split('.')[-1]) if file_ext == "json": obj = s3.get_object( Bucket=bucket, Key=key ) results = obj['Body'].read().decode('utf-8') results_json = json.loads(results) try: uri_data = results_json["TextTranscriptUri"] except KeyError: raise MasExecutionError("JSON can only be passed in from AWS transcribe") else: bucket = uri_data['S3Bucket'] key = uri_data['S3Key'] uri = "s3://" + bucket + '/' + key # If input text is empty then we're done. response = s3.head_object(Bucket=bucket, Key=key) if response['ContentLength'] < 1: operator_object.update_workflow_status("Complete") operator_object.add_workflow_metadata(comprehend_phrases_job_id="Empty input --> empty output.") return operator_object.return_output_object() except KeyError: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(comprehend_error="No valid inputs") raise MasExecutionError(operator_object.return_output_object()) try: asset_id = operator_object.asset_id except KeyError: print('No asset id for this workflow') asset_id = '' dataplane = DataPlane() output_uri_request = dataplane.generate_media_storage_path(asset_id, workflow_id) output_uri = "s3://{bucket}/{key}".format(bucket=output_uri_request["S3Bucket"], key=output_uri_request["S3Key"] + "/comprehend_phrases") try: comprehend.start_key_phrases_detection_job( InputDataConfig={ 'S3Uri': uri, 'InputFormat': 'ONE_DOC_PER_FILE' }, OutputDataConfig={ 'S3Uri': output_uri }, DataAccessRoleArn=comprehend_role, JobName=workflow_id, LanguageCode='en' ) except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(comprehend_error="Unable to get response from comprehend: {e}".format(e=str(e))) raise MasExecutionError(operator_object.return_output_object()) else: comprehend_job_id = workflow_id operator_object.add_workflow_metadata(comprehend_phrases_job_id=comprehend_job_id, output_uri=output_uri) operator_object.update_workflow_status('Executing') return operator_object.return_output_object()
def lambda_handler(event, context): print("We got the following event:\n", event) try: if "Video" in event["Input"]["Media"]: s3bucket = event["Input"]["Media"]["ProxyEncode"]["S3Bucket"] s3key = event["Input"]["Media"]["ProxyEncode"]["S3Key"] elif "Image" in event["Input"]["Media"]: s3bucket = event["Input"]["Media"]["Image"]["S3Bucket"] s3key = event["Input"]["Media"]["Image"]["S3Key"] workflow_id = str(event["WorkflowExecutionId"]) asset_id = event['AssetId'] except Exception: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( ContentModerationError="No valid inputs") raise MasExecutionError(output_object.return_output_object()) print("Processing s3://" + s3bucket + "/" + s3key) valid_video_types = [".avi", ".mp4", ".mov"] valid_image_types = [".png", ".jpg", ".jpeg"] file_type = os.path.splitext(s3key)[1].lower() if file_type in valid_image_types: # Image processing is synchronous. response = detect_moderation_labels(s3bucket, urllib.parse.unquote_plus(s3key)) output_object.add_workflow_metadata(AssetId=asset_id, WorkflowExecutionId=workflow_id) dataplane = DataPlane() metadata_upload = dataplane.store_asset_metadata( asset_id, operator_name, workflow_id, response) if "Status" not in metadata_upload: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( ContentModerationError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id)) raise MasExecutionError(output_object.return_output_object()) else: if metadata_upload["Status"] == "Success": print("Uploaded metadata for asset: {asset}".format( asset=asset_id)) output_object.update_workflow_status("Complete") return output_object.return_output_object() elif metadata_upload["Status"] == "Failed": output_object.update_workflow_status("Error") output_object.add_workflow_metadata( ContentModerationError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id)) raise MasExecutionError(output_object.return_output_object()) else: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( ContentModerationError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id)) raise MasExecutionError(output_object.return_output_object()) elif file_type in valid_video_types: job_id = start_content_moderation(s3bucket, urllib.parse.unquote_plus(s3key)) output_object.update_workflow_status("Executing") output_object.add_workflow_metadata(JobId=job_id, AssetId=asset_id, WorkflowExecutionId=workflow_id) return output_object.return_output_object() else: print("ERROR: invalid file type") output_object.update_workflow_status("Error") output_object.add_workflow_metadata( ContentModerationError="Not a valid file type") raise MasExecutionError(output_object.return_output_object())
def lambda_handler(event, context): print("We got the following event:\n", event) try: status = event["Status"] asset_id = event['MetaData']['AssetId'] except KeyError as e: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( CelebrityRecognitionError="Missing key {e}".format(e=e)) raise MasExecutionError(output_object.return_output_object()) # Images will have already been processed, so return if job status is already set. if status == "Complete": output_object.update_workflow_status("Complete") return output_object.return_output_object() try: job_id = event["MetaData"]["JobId"] workflow_id = event["MetaData"]["WorkflowExecutionId"] except KeyError as e: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( CelebrityRecognitionError="Missing a required metadata key {e}". format(e=e)) raise MasExecutionError(output_object.return_output_object()) # Check rekognition job status: dataplane = DataPlane() pagination_token = '' is_paginated = False # If pagination token is in event["MetaData"] then use that to start # reading reko results from where this Lambda's previous invocation left off. if ("PageToken" in event["MetaData"]): pagination_token = event["MetaData"]["PageToken"] is_paginated = True # Read and persist 10 reko pages per invocation of this Lambda for page_number in range(11): # Get reko results print("job id: " + job_id + " page token: " + pagination_token) try: response = rek.get_celebrity_recognition( JobId=job_id, NextToken=pagination_token) except rek.exceptions.InvalidPaginationTokenException as e: # Trying to reverse seek to the last valid pagination token would be difficult # to implement, so in the rare case that a pagination token expires we'll # just start over by reading from the first page. print(e) print( "WARNING: Invalid pagination token found. Restarting read from first page." ) pagination_token = '' continue # If the reko job is IN_PROGRESS then return. We'll check again after a step function wait. if response['JobStatus'] == "IN_PROGRESS": output_object.update_workflow_status("Executing") output_object.add_workflow_metadata( JobId=job_id, AssetId=asset_id, WorkflowExecutionId=workflow_id) return output_object.return_output_object() # If the reko job is FAILED then mark the workflow status as Error and return. elif response['JobStatus'] == "FAILED": output_object.update_workflow_status("Error") output_object.add_workflow_metadata(JobId=job_id, CelebrityRecognitionError=str( response["StatusMessage"])) raise MasExecutionError(output_object.return_output_object()) # If the reko job is SUCCEEDED then save this current reko page result # and continue to next page_number. elif response['JobStatus'] == "SUCCEEDED": # If reko results contain more pages then save this page and continue to the next page if 'NextToken' in response: is_paginated = True # Persist rekognition results (current page) metadata_upload = dataplane.store_asset_metadata( asset_id=asset_id, operator_name=operator_name, workflow_id=workflow_id, results=response, paginate=True, end=False) # If dataplane request succeeded then get the next pagination token and continue. 
if "Status" in metadata_upload and metadata_upload[ "Status"] == "Success": # Log that this page has been successfully uploaded to the dataplane print( "Uploaded metadata for asset: {asset}, job {JobId}, page {page}" .format(asset=asset_id, JobId=job_id, page=pagination_token)) # Get the next pagination token: pagination_token = response['NextToken'] # In order to avoid Lambda timeouts, we're only going to persist 10 pages then # pass the pagination token to the workflow metadata and let our step function # invoker restart this Lambda. The pagination token allows this Lambda # continue from where it left off. if page_number == 10: output_object.update_workflow_status("Executing") output_object.add_workflow_metadata( PageToken=pagination_token, JobId=job_id, AssetId=asset_id, WorkflowExecutionId=workflow_id) return output_object.return_output_object() # If dataplane request failed then mark workflow as failed else: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( CelebrityRecognitionError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id), JobId=job_id) raise MasExecutionError( output_object.return_output_object()) # If reko results contain no more pages then save this page and mark the stage complete else: # If we've been saving pages, then tell dataplane this is the last page if is_paginated: metadata_upload = dataplane.store_asset_metadata( asset_id=asset_id, operator_name=operator_name, workflow_id=workflow_id, results=response, paginate=True, end=True) # If there is only one page then save to dataplane without dataplane options else: metadata_upload = dataplane.store_asset_metadata( asset_id=asset_id, operator_name=operator_name, workflow_id=workflow_id, results=response) # If dataplane request succeeded then mark the stage complete if "Status" in metadata_upload and metadata_upload[ "Status"] == "Success": print("Uploaded metadata for asset: {asset}".format( asset=asset_id)) output_object.add_workflow_metadata(JobId=job_id) output_object.update_workflow_status("Complete") return output_object.return_output_object() # If dataplane request failed then mark workflow as failed else: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( CelebrityRecognitionError= "Unable to upload metadata for {asset}: {error}". format(asset=asset_id, error=metadata_upload)) output_object.add_workflow_metadata(JobId=job_id) raise MasExecutionError( output_object.return_output_object()) # If reko job failed then mark workflow as failed else: output_object.update_workflow_status("Error") output_object.add_workflow_metadata( CelebrityRecognitionError="Unable to determine status") raise MasExecutionError(output_object.return_output_object())
def lambda_handler(event, context): print("We got this event:\n", event) operator_object = MediaInsightsOperationHelper(event) try: workflow_id = operator_object.workflow_execution_id except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_error="Missing a required metadata key {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: bucket = operator_object.input["Media"]["Text"]["S3Bucket"] key = operator_object.input["Media"]["Text"]["S3Key"] # If operator_object.input["Media"]["Text"]["S3Key"] is a json file, # then we're working with metadata about the text file and need to # get the actual transcript text from the TextTranscriptUri field. # Otherwise we assume operator_object.input["Media"]["Text"]["S3Key"] # contains only the transcript text. file_ext = str(key.split('.')[-1]) if file_ext == "json": obj = s3.get_object(Bucket=bucket, Key=key) results = obj['Body'].read().decode('utf-8') results_json = json.loads(results) try: uri_data = results_json["TextTranscriptUri"] except KeyError: raise MasExecutionError( "JSON can only be passed in from AWS transcribe") else: bucket = uri_data['S3Bucket'] key = uri_data['S3Key'] uri = "s3://" + bucket + '/' + key # If input text is empty then we're done. response = s3.head_object(Bucket=bucket, Key=key) # If a KmsKey is specified as an input to this operator, then use that # to enable encryption in the Comprehend job. kms_key_id = "" if "KmsKeyId" in operator_object.configuration: kms_key_id = operator_object.configuration["KmsKeyId"] print( "Found a KMS Key Id. Encryption will be enabled in the Comprehend job." ) else: print( "No KMS Key was specified. Encryption will not be enabled in the Comprehend job." ) if response['ContentLength'] < 1: operator_object.update_workflow_status("Complete") operator_object.add_workflow_metadata( comprehend_entity_job_id="Empty input --> empty output.") return operator_object.return_output_object() except KeyError: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_error="No valid inputs") raise MasExecutionError(operator_object.return_output_object()) try: asset_id = operator_object.asset_id except KeyError: print('No asset id for this workflow') asset_id = '' dataplane = DataPlane() output_uri_request = dataplane.generate_media_storage_path( asset_id, workflow_id) output_uri = "s3://{bucket}/{key}".format( bucket=output_uri_request["S3Bucket"], key=output_uri_request["S3Key"] + '/comprehend_entities') try: if kms_key_id != '': # If the user specified a KMS key then enable comprehend job encryption. comprehend.start_entities_detection_job( InputDataConfig={ "S3Uri": uri, "InputFormat": "ONE_DOC_PER_FILE" }, OutputDataConfig={ "S3Uri": output_uri, "KmsKeyId": kms_key_id }, DataAccessRoleArn=comprehend_role, VolumeKmsKeyId=kms_key_id, JobName=workflow_id, LanguageCode="en") else: comprehend.start_entities_detection_job( InputDataConfig={ "S3Uri": uri, "InputFormat": "ONE_DOC_PER_FILE" }, OutputDataConfig={"S3Uri": output_uri}, DataAccessRoleArn=comprehend_role, JobName=workflow_id, LanguageCode="en") except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_error="Unable to get response from comprehend: {e}". 
format(e=str(e))) raise MasExecutionError(operator_object.return_output_object()) else: comprehend_job_id = workflow_id operator_object.add_workflow_metadata( comprehend_entity_job_id=comprehend_job_id, entity_output_uri=output_uri) operator_object.update_workflow_status('Executing') return operator_object.return_output_object()
def lambda_handler(event, context): print("We got this event:\n", event) operator_object = MediaInsightsOperationHelper(event) try: workflow_id = operator_object.workflow_execution_id asset_id = operator_object.asset_id job_id = workflow_id except KeyError: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_error="No valid job id") raise MasExecutionError(operator_object.return_output_object()) try: response = comprehend.list_key_phrases_detection_jobs(Filter={ 'JobName': job_id, }, ) except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_error="Unable to get response from comprehend: {e}". format(e=e)) raise MasExecutionError(operator_object.return_output_object()) else: print(response) comprehend_status = response["KeyPhrasesDetectionJobPropertiesList"][ 0]["JobStatus"] if comprehend_status == "SUBMITTED" or comprehend_status == "IN_PROGRESS": operator_object.add_workflow_metadata( comprehend_phrases_job_id=job_id) operator_object.update_workflow_status("Executing") return operator_object.return_output_object() elif comprehend_status == "COMPLETED": output_uri = response["KeyPhrasesDetectionJobPropertiesList"][0][ "OutputDataConfig"]["S3Uri"] delimeter = '/' bucket = delimeter.join(output_uri.split(delimeter)[2:3]) file_name = output_uri.split(delimeter)[-1] key = delimeter.join( output_uri.split(delimeter)[3:-1]) + '/' + file_name comprehend_tarball = read_from_s3(bucket, key) comprehend_data = { "LanguageCode": response['KeyPhrasesDetectionJobPropertiesList'][0] ['LanguageCode'], "Results": [] } if comprehend_tarball["Status"] == "Success": input_bytes = comprehend_tarball["Object"] with tarfile.open(fileobj=BytesIO(input_bytes)) as tf: for member in tf: if member.isfile(): comprehend_data["Results"].append( tf.extractfile(member).read().decode('utf-8')) dataplane = DataPlane() metadata_upload = dataplane.store_asset_metadata( asset_id, "key_phrases", workflow_id, comprehend_data) if "Status" not in metadata_upload: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_error="Unable to store key phrases data {e}" .format(e=metadata_upload)) raise MasExecutionError( operator_object.return_output_object()) else: if metadata_upload["Status"] == "Success": operator_object.update_workflow_status("Complete") operator_object.add_workflow_metadata( comprehend_entity_job_id=job_id, output_uri=output_uri) operator_object.update_workflow_status("Complete") return operator_object.return_output_object() else: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_error= "Unable to store key phrases data {e}".format( e=metadata_upload)) raise MasExecutionError( operator_object.return_output_object()) else: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_entity_job_id=job_id, comprehend_error="could not retrieve output from s3: {e}". format(e=comprehend_tarball["Message"])) raise MasExecutionError(operator_object.return_output_object()) else: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( comprehend_phrases_job_id=job_id, comprehend_error="comprehend returned as failed: {e}".format( e=response["KeyPhrasesDetectionJobPropertiesList"][0] ["Message"])) raise MasExecutionError(operator_object.return_output_object())
import os
import json
import boto3
import urllib3
import html
import webvtt
from io import StringIO
from botocore import config
from urllib.parse import urlparse
from datetime import datetime

from MediaInsightsEngineLambdaHelper import MediaInsightsOperationHelper
from MediaInsightsEngineLambdaHelper import MasExecutionError
from MediaInsightsEngineLambdaHelper import DataPlane

s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
headers = {"Content-Type": "application/json"}
dataplane = DataPlane()

mie_config = json.loads(os.environ['botoConfig'])
config = config.Config(**mie_config)
translate_client = boto3.client('translate', config=config)
polly = boto3.client('polly', config=config)


class WebCaptions:
    def __init__(self, operator_object):
        """
        :param operator_object: The operation helper for the event passed in to the operator
        """
        print("WebCaptions operator_object = {}".format(operator_object))
        self.operator_object = operator_object

def lambda_handler(event, context):
    operator_object = MediaInsightsOperationHelper(event)
    try:
        job_id = operator_object.metadata["TranscribeJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = transcribe.get_transcription_job(TranscriptionJobName=job_id)
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError=str(e), TranscribeJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["TranscriptionJob"]["TranscriptionJobStatus"] == "IN_PROGRESS":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(TranscribeJobId=job_id,
                                                  AssetId=asset_id,
                                                  WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "FAILED":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError=str(response["TranscriptionJob"]["FailureReason"]))
            raise MasExecutionError(operator_object.return_output_object())
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "COMPLETED":
            transcribe_uri = response["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]
            http = urllib3.PoolManager()
            transcription = http.request('GET', transcribe_uri)
            transcription_data = transcription.data.decode("utf-8")
            transcription_json = json.loads(transcription_data)
            # Concatenate the transcripts into a single text-only transcript.
            text_only_transcript = ''
            for transcripts in transcription_json["results"]["transcripts"]:
                transcript = transcripts["transcript"]
                text_only_transcript = text_only_transcript + transcript
            print(text_only_transcript)
            dataplane = DataPlane()
            s3 = boto3.client('s3')
            transcript_storage_path = dataplane.generate_media_storage_path(asset_id, workflow_id)
            key = transcript_storage_path['S3Key'] + "transcript.txt"
            bucket = transcript_storage_path['S3Bucket']
            s3.put_object(Bucket=bucket, Key=key, Body=text_only_transcript)
            transcription_json["TextTranscriptUri"] = {"S3Bucket": bucket, "S3Key": key}
            metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id,
                                                             transcription_json)
            if "Status" not in metadata_upload:
                operator_object.add_workflow_metadata(
                    TranscribeError="Unable to upload metadata for asset: {asset}".format(asset=asset_id),
                    TranscribeJobId=job_id)
                operator_object.update_workflow_status("Error")
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if metadata_upload['Status'] == 'Success':
                    operator_object.add_media_object('Text', metadata_upload['Bucket'], metadata_upload['Key'])
                    operator_object.add_workflow_metadata(TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Complete")
                    return operator_object.return_output_object()
                else:
                    operator_object.add_workflow_metadata(
                        TranscribeError="Unable to upload metadata for asset: {asset}".format(asset=asset_id),
                        TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Error")
                    raise MasExecutionError(operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranscribeError="Unable to determine status")
            raise MasExecutionError(operator_object.return_output_object())

def lambda_handler(event, context): print("We got the following event:\n", event) operator_object = MediaInsightsOperationHelper(event) bucket = '' key = '' try: if "Video" in event["Input"]["Media"]: bucket = event["Input"]["Media"]["Video"]["S3Bucket"] key = event["Input"]["Media"]["Video"]["S3Key"] elif "Image" in event["Input"]["Media"]: bucket = event["Input"]["Media"]["Image"]["S3Bucket"] key = event["Input"]["Media"]["Image"]["S3Key"] workflow_id = str(operator_object.workflow_execution_id) except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(MediaconvertError="Missing a required metadata key {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) # Adding in exception block for now since we aren't guaranteed an asset id will be present, should remove later try: asset_id = operator_object.asset_id except KeyError as e: print("No asset id passed in with this workflow", e) asset_id = '' # Get metadata s3_cli = boto3.client("s3", region_name=region, config=Config(signature_version='s3v4', s3={'addressing_style': 'virtual'})) metadata_json = {} try: # The number of seconds that the Signed URL is valid: signed_url_expiration = 300 # Generate a signed URL for reading a file from S3 via HTTPS signed_url = s3_cli.generate_presigned_url('get_object', Params={'Bucket': bucket, 'Key': key}, ExpiresIn=signed_url_expiration) # Launch MediaInfo media_info = MediaInfo.parse(signed_url) # Save the result metadata_json = json.loads(media_info.to_json()) # If there's no Video, Audio, Image, or Text data then delete the file. track_types = [track['track_type'] for track in metadata_json['tracks']] if ('Video' not in track_types and 'Audio' not in track_types and 'Image' not in track_types and 'Text' not in track_types): print("ERROR: File does not contain valid video, audio, image, or text content") print("Deleting file s3://" + bucket + "/" + key) s3_cli.delete_object(Bucket=bucket, Key=key) operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(MediainfoError="File does not contain valid video, audio, image, or text content") raise MasExecutionError(operator_object.return_output_object()) except RuntimeError as e: # If MediaInfo could not run then we assume it is not a valid # media file and delete it print("Exception:\n", e) print("ERROR: File does not contain valid video, audio, image, or text content") print("Deleting file s3://" + bucket + "/" + key) s3_cli.delete_object(Bucket=bucket, Key=key) operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(MediainfoError="File does not contain valid video, audio, image, or text content") raise MasExecutionError(operator_object.return_output_object()) except Exception as e: print("Exception:\n", e) operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(MediainfoError="Unable to get Mediainfo results. " + str(e)) raise MasExecutionError(operator_object.return_output_object()) # Verify that the metadata is a dict, as required by the dataplane if type(metadata_json) != dict: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( MediainfoError="Metadata must be of type dict. 
Found " + str(type(metadata_json)) + " instead.") raise MasExecutionError(operator_object.return_output_object()) # Pass metadata to downstream operators # Number of audio tracks is used by the Transcribe operator num_audio_tracks = len(list(filter(lambda i: i['track_type'] == 'Audio', metadata_json['tracks']))) operator_object.add_workflow_metadata(Mediainfo_num_audio_tracks=str(num_audio_tracks)) # Save metadata to dataplane operator_object.add_workflow_metadata(AssetId=asset_id, WorkflowExecutionId=workflow_id) dataplane = DataPlane() metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id, metadata_json) # Validate that the metadata was saved to the dataplane if "Status" not in metadata_upload: operator_object.add_workflow_metadata( MediainfoError="Unable to upload metadata for asset: {asset}".format(asset=asset_id)) operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object()) else: # Update the workflow status if metadata_upload["Status"] == "Success": print("Uploaded metadata for asset: {asset}".format(asset=asset_id)) operator_object.update_workflow_status("Complete") return operator_object.return_output_object() else: operator_object.add_workflow_metadata( MediainfoError="Unable to upload metadata for asset: {asset}".format(asset=asset_id)) operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context): print("We got the following event:\n", event) operator_object = MediaInsightsOperationHelper(event) # Get operator parameters try: workflow_id = str(event["WorkflowExecutionId"]) asset_id = event['AssetId'] if "Video" in operator_object.input["Media"]: bucket = operator_object.input["Media"]["Video"]["S3Bucket"] key = operator_object.input["Media"]["Video"]["S3Key"] file_type = key.split('.')[-1] elif "Audio" in operator_object.input["Media"]: bucket = operator_object.input["Media"]["Audio"]["S3Bucket"] key = operator_object.input["Media"]["Audio"]["S3Key"] file_type = key.split('.')[-1] elif "Image" in operator_object.input["Media"]: bucket = operator_object.input["Media"]["Image"]["S3Bucket"] key = operator_object.input["Media"]["Image"]["S3Key"] file_type = key.split('.')[-1] elif "Text" in operator_object.input["Media"]: bucket = operator_object.input["Media"]["Text"]["S3Bucket"] key = operator_object.input["Media"]["Text"]["S3Key"] file_type = key.split('.')[-1] except Exception: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( GenericDataLookupError="No valid inputs") raise MasExecutionError(operator_object.return_output_object()) # Get the metadata filename print("Looking up metadata for s3://" + bucket + "/" + key) # Get user-defined location for generic data file if "Key" in operator_object.configuration: metadata_filename = operator_object.configuration["Key"] else: operator_object.add_workflow_metadata( GenericDataLookupError="Missing S3 key for data file.") operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object()) if "Bucket" in operator_object.configuration: metadata_bucket = operator_object.configuration["Bucket"] else: operator_object.add_workflow_metadata( GenericDataLookupError="Missing S3 bucket for data file.") operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object()) # Get metadata s3 = boto3.client('s3') try: print("Getting data from s3://" + metadata_bucket + "/" + metadata_filename) data = s3.get_object(Bucket=metadata_bucket, Key=metadata_filename) metadata_json = json.loads(data['Body'].read().decode('utf-8')) except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( GenericDataLookupError="Unable read datafile. " + str(e)) raise MasExecutionError(operator_object.return_output_object()) # Verify that the metadata is a dict, as required by the dataplane if (type(metadata_json) != dict): operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( GenericDataLookupError="Metadata must be of type dict. 
Found " + str(type(metadata_json)) + " instead.") raise MasExecutionError(operator_object.return_output_object()) # Save metadata to dataplane operator_object.add_workflow_metadata(AssetId=asset_id, WorkflowExecutionId=workflow_id) dataplane = DataPlane() metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id, metadata_json) # Validate that the metadata was saved to the dataplane if "Status" not in metadata_upload: operator_object.add_workflow_metadata( GenericDataLookupError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id)) operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object()) else: # Update the workflow status if metadata_upload["Status"] == "Success": print( "Uploaded metadata for asset: {asset}".format(asset=asset_id)) operator_object.update_workflow_status("Complete") return operator_object.return_output_object() else: operator_object.add_workflow_metadata( GenericDataLookupError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id)) operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context):
    try:
        status = event["Status"]
        asset_id = event['MetaData']['AssetId']
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(CelebrityRecognitionError="Missing key {e}".format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Images will have already been processed, so return if job status is already set.
    if status == "Complete":
        # TODO: Persist rekognition output
        output_object.update_workflow_status("Complete")
        return output_object.return_output_object()
    try:
        job_id = event["MetaData"]["CelebrityRecognitionJobId"]
        workflow_id = event["MetaData"]["WorkflowExecutionId"]
    except KeyError as e:
        output_object.update_workflow_status("Error")
        output_object.add_workflow_metadata(
            CelebrityRecognitionError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(output_object.return_output_object())
    # Check rekognition job status:
    rek = boto3.client('rekognition')
    dataplane = DataPlane()
    max_results = 1000
    pagination_token = ''
    finished = False
    # Pagination starts on the 1001st result. This while loop works through each page.
    while not finished:
        response = rek.get_celebrity_recognition(JobId=job_id,
                                                 MaxResults=max_results,
                                                 NextToken=pagination_token)
        if response['JobStatus'] == "IN_PROGRESS":
            finished = True
            output_object.update_workflow_status("Executing")
            output_object.add_workflow_metadata(CelebrityRecognitionJobId=job_id,
                                                AssetId=asset_id,
                                                WorkflowExecutionId=workflow_id)
            return output_object.return_output_object()
        elif response['JobStatus'] == "FAILED":
            finished = True
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(
                CelebrityRecognitionJobId=job_id,
                CelebrityRecognitionError=str(response["StatusMessage"]))
            raise MasExecutionError(output_object.return_output_object())
        elif response['JobStatus'] == "SUCCEEDED":
            if 'NextToken' in response:
                pagination_token = response['NextToken']
                # Persist rekognition results (current page)
                metadata_upload = dataplane.store_asset_metadata(asset_id, operator_name, workflow_id, response)
                if "Status" not in metadata_upload:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        CelebrityRecognitionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                    raise MasExecutionError(output_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        print("Uploaded metadata for asset: {asset}".format(asset=asset_id))
                    elif metadata_upload["Status"] == "Failed":
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                        raise MasExecutionError(output_object.return_output_object())
                    else:
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                        output_object.add_workflow_metadata(CelebrityRecognitionJobId=job_id)
                        raise MasExecutionError(output_object.return_output_object())
            else:
                finished = True
                # Persist rekognition results
                metadata_upload = dataplane.store_asset_metadata(asset_id, operator_name, workflow_id, response)
                if "Status" not in metadata_upload:
                    output_object.update_workflow_status("Error")
                    output_object.add_workflow_metadata(
                        CelebrityRecognitionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                    raise MasExecutionError(output_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        print("Uploaded metadata for asset: {asset}".format(asset=asset_id))
                    elif metadata_upload["Status"] == "Failed":
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                        raise MasExecutionError(output_object.return_output_object())
                    else:
                        output_object.update_workflow_status("Error")
                        output_object.add_workflow_metadata(
                            CelebrityRecognitionError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
                        output_object.add_workflow_metadata(CelebrityRecognitionJobId=job_id)
                        raise MasExecutionError(output_object.return_output_object())
                output_object.add_workflow_metadata(CelebrityRecognitionJobId=job_id)
                output_object.update_workflow_status("Complete")
                return output_object.return_output_object()
        else:
            output_object.update_workflow_status("Error")
            output_object.add_workflow_metadata(CelebrityRecognitionError="Unable to determine status")
            raise MasExecutionError(output_object.return_output_object())

def lambda_handler(event, context): print("We got the following event:\n", event) operator_object = MediaInsightsOperationHelper(event) try: bucket = operator_object.input["Media"]["Text"]["S3Bucket"] key = operator_object.input["Media"]["Text"]["S3Key"] except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranslateError="No valid inputs {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: workflow_id = operator_object.workflow_execution_id except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranslateError="Missing a required metadata key {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: asset_id = operator_object.asset_id except KeyError: print('No asset id for this workflow') asset_id = '' try: source_lang = operator_object.configuration["SourceLanguageCode"] target_lang = operator_object.configuration["TargetLanguageCode"] except KeyError: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranslateError="Language codes are not defined") raise MasExecutionError(operator_object.return_output_object()) try: s3_response = s3.get_object(Bucket=bucket, Key=key) transcribe_metadata = json.loads( s3_response["Body"].read().decode("utf-8")) transcript = transcribe_metadata["results"]["transcripts"][0][ "transcript"] except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranslateError="Unable to read transcription from S3: {e}".format( e=str(e))) raise MasExecutionError(operator_object.return_output_object()) # If input text is empty then we're done. if len(transcript) < 1: operator_object.update_workflow_status("Complete") return operator_object.return_output_object() # Tell the NLTK data loader to look for files in /tmp/ nltk.data.path.append("/tmp/") # Download NLTK tokenizers to /tmp/ # We use /tmp because that's where AWS Lambda provides write access to the local file system. nltk.download('punkt', download_dir='/tmp/') # Create language tokenizer according to user-specified source language. # Default to English. 
    if source_lang == 'fr':
        print("Using French dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    elif source_lang == 'de':
        print("Using German dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
    elif source_lang == 'ru':
        print("Using Russian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    elif source_lang == 'it':
        print("Using Italian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
    elif source_lang == 'pt':
        print("Using Portuguese dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
    elif source_lang == 'es':
        print("Using Spanish dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    else:
        print("Using English dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Split input text into a list of sentences
    sentences = tokenizer.tokenize(transcript)
    print("Input text length: " + str(len(transcript)))
    print("Number of sentences: " + str(len(sentences)))
    translated_text = ''
    transcript_chunk = ''
    for sentence in sentences:
        # Translate can handle 5000 unicode characters but we'll process no
        # more than 1000 just to be on the safe side.
        # Even by limiting input text to 3000 characters, we've still seen
        # translate throttling with a RateExceeded exception.
        # Reducing input text to 1000 characters seemed to fix this.
        if len(sentence) + len(transcript_chunk) < 1000:
            transcript_chunk = transcript_chunk + ' ' + sentence
        else:
            try:
                print("Translation input text length: " + str(len(transcript_chunk)))
                translation_chunk = translate_client.translate_text(
                    Text=transcript_chunk,
                    SourceLanguageCode=source_lang,
                    TargetLanguageCode=target_lang)
                print("Translation output text length: " + str(len(translation_chunk["TranslatedText"])))
            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    TranslateError="Unable to get response from translate: {e}".format(e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())
            translated_text = translated_text + ' ' + translation_chunk["TranslatedText"]
            transcript_chunk = sentence
    print("Translating the final chunk of input text...")
    try:
        print("Translation input text length: " + str(len(transcript_chunk)))
        translation_chunk = translate_client.translate_text(
            Text=transcript_chunk,
            SourceLanguageCode=source_lang,
            TargetLanguageCode=target_lang)
        print("Translation output text length: " + str(len(translation_chunk["TranslatedText"])))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to get response from translate: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    translated_text = translated_text + ' ' + translation_chunk["TranslatedText"]
    # Put final result into a JSON object because the MIE dataplane requires it to be so.
    translation_result = {}
    translation_result["TranslatedText"] = translated_text
    translation_result["SourceLanguageCode"] = source_lang
    translation_result["TargetLanguageCode"] = target_lang
    print("Final translation text length: " + str(len(translated_text)))
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id, translation_result)
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if metadata_upload['Status'] == 'Success':
            operator_object.add_media_object('Text', metadata_upload['Bucket'], metadata_upload['Key'])
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())

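# The translation loop above accumulates tokenized sentences into chunks of roughly 1000
# characters before each call to Amazon Translate, then translates whatever is left over
# as a final chunk. A minimal standalone sketch of that chunking logic, distilled from
# the handler (the 1000-character limit comes from the code above; this helper is an
# illustration, not part of the operator):
def chunk_sentences(sentences, max_chars=1000):
    """Group sentences into chunks shorter than max_chars characters."""
    chunks = []
    current = ''
    for sentence in sentences:
        if len(sentence) + len(current) < max_chars:
            # Keep accumulating sentences into the current chunk.
            current = current + ' ' + sentence
        else:
            # Flush the current chunk and start a new one with this sentence.
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
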
def lambda_handler(event, context): print("We got the following event:\n", event) operator_object = MediaInsightsOperationHelper(event) try: bucket = operator_object.input["Media"]["Text"]["S3Bucket"] key = operator_object.input["Media"]["Text"]["S3Key"] except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(TranslateError="No valid inputs {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: workflow_id = operator_object.workflow_execution_id except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(TranslateError="Missing a required metadata key {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: asset_id = operator_object.asset_id except KeyError: print('No asset id for this workflow') asset_id = '' try: source_lang = operator_object.configuration["SourceLanguageCode"] target_lang = operator_object.configuration["TargetLanguageCode"] except KeyError: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(TranslateError="Language codes are not defined") raise MasExecutionError(operator_object.return_output_object()) try: s3_response = s3.get_object(Bucket=bucket, Key=key) transcribe_metadata = json.loads(s3_response["Body"].read().decode("utf-8")) transcript = transcribe_metadata["results"]["transcripts"][0]["transcript"] except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(TranslateError="Unable to read transcription from S3: {e}".format(e=str(e))) raise MasExecutionError(operator_object.return_output_object()) try: translation = translate_client.translate_text( Text=transcript, SourceLanguageCode=source_lang, TargetLanguageCode=target_lang ) except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(TranslateError="Unable to get response from translate: {e}".format(e=str(e))) raise MasExecutionError(operator_object.return_output_object()) else: dataplane = DataPlane() metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name, workflow_id, translation) if "Status" not in metadata_upload: operator_object.add_workflow_metadata( TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id)) operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object()) else: if metadata_upload['Status'] == 'Success': operator_object.add_media_object('Text', metadata_upload['Bucket'], metadata_upload['Key']) operator_object.update_workflow_status("Complete") return operator_object.return_output_object() else: operator_object.add_workflow_metadata( TranslateError="Unable to upload metadata for asset: {asset}".format(asset=asset_id)) operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context): print("We got this event:\n", event) operator_object = MediaInsightsOperationHelper(event) # If Transcribe wasn't run due to silent audio, then we're done if "Mediainfo_num_audio_tracks" in event["Input"]["MetaData"] and event[ "Input"]["MetaData"]["Mediainfo_num_audio_tracks"] == "0": operator_object.update_workflow_status("Complete") return operator_object.return_output_object() try: job_id = operator_object.metadata["TranscribeJobId"] workflow_id = operator_object.workflow_execution_id asset_id = operator_object.asset_id except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranscribeError="Missing a required metadata key {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: response = transcribe.get_transcription_job( TranscriptionJobName=job_id) source_language = response['TranscriptionJob']['LanguageCode'] print("get_transcription_job response:") print(response) except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranscribeError=str(e), TranscribeJobId=job_id, ) raise MasExecutionError(operator_object.return_output_object()) else: if response["TranscriptionJob"][ "TranscriptionJobStatus"] == "IN_PROGRESS": operator_object.update_workflow_status("Executing") operator_object.add_workflow_metadata( TranscribeJobId=job_id, AssetId=asset_id, WorkflowExecutionId=workflow_id) return operator_object.return_output_object() elif response["TranscriptionJob"][ "TranscriptionJobStatus"] == "FAILED": operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranscribeJobId=job_id, TranscribeError=str( response["TranscriptionJob"]["FailureReason"])) raise MasExecutionError(operator_object.return_output_object()) elif response["TranscriptionJob"][ "TranscriptionJobStatus"] == "COMPLETED": transcribe_uri = response["TranscriptionJob"]["Transcript"][ "TranscriptFileUri"] http = urllib3.PoolManager() transcription = http.request('GET', transcribe_uri) transcription_data = transcription.data.decode("utf-8") transcription_json = json.loads(transcription_data) text_only_transcript = '' for transcripts in transcription_json["results"]["transcripts"]: transcript = transcripts["transcript"] text_only_transcript = text_only_transcript.join(transcript) print(text_only_transcript) dataplane = DataPlane() s3 = boto3.client('s3', config=config) transcript_storage_path = dataplane.generate_media_storage_path( asset_id, workflow_id) key = transcript_storage_path['S3Key'] + "transcript.txt" bucket = transcript_storage_path['S3Bucket'] s3.put_object(Bucket=bucket, Key=key, Body=text_only_transcript) transcription_json["TextTranscriptUri"] = { "S3Bucket": bucket, "S3Key": key } metadata_upload = dataplane.store_asset_metadata( asset_id, operator_object.name, workflow_id, transcription_json) if "Status" not in metadata_upload: operator_object.add_workflow_metadata( TranscribeError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id), TranscribeJobId=job_id) operator_object.update_workflow_status("Error") raise MasExecutionError(operator_object.return_output_object()) else: if metadata_upload['Status'] == 'Success': operator_object.add_media_object('Text', metadata_upload['Bucket'], metadata_upload['Key']) # The source language may be user-specified or auto-detected by # Transcribe. Either way, pass it to downstream operators as # workflow metadata. 
operator_object.add_workflow_metadata( TranscribeJobId=job_id, TranscribeSourceLanguage=source_language) operator_object.update_workflow_status("Complete") return operator_object.return_output_object() else: operator_object.add_workflow_metadata( TranscribeError= "Unable to upload metadata for asset: {asset}".format( asset=asset_id), TranscribeJobId=job_id) operator_object.update_workflow_status("Error") raise MasExecutionError( operator_object.return_output_object()) else: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata( TranscribeError="Unable to determine status") raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context): #print("We got this event:\n", event) operator_object = MediaInsightsOperationHelper(event) try: workflow_id = operator_object.workflow_execution_id except KeyError as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(comprehend_error="Missing a required metadata key {e}".format(e=e)) raise MasExecutionError(operator_object.return_output_object()) try: bucket = operator_object.input["Media"]["Text"]["S3Bucket"] key = operator_object.input["Media"]["Text"]["S3Key"] file_ext = str(key.split('.')[-1]) if file_ext == "json": s3 = boto3.client('s3') obj = s3.get_object( Bucket=bucket, Key=key ) results = obj['Body'].read().decode('utf-8') results_json = json.loads(results) try: uri_data = results_json["TextTranscriptUri"] except KeyError: raise MasExecutionError("JSON can only be passed in from AWS transcribe") else: uri = "s3://" + uri_data['S3Bucket'] + '/' + uri_data['S3Key'] else: uri = "s3://" + bucket + '/' + key except KeyError: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(comprehend_error="No valid inputs") raise MasExecutionError(operator_object.return_output_object()) try: asset_id = operator_object.asset_id except KeyError: print('No asset id for this workflow') asset_id = '' dataplane = DataPlane() output_uri_request = dataplane.generate_media_storage_path(asset_id, workflow_id) output_uri = "s3://{bucket}/{key}".format(bucket=output_uri_request["S3Bucket"], key=output_uri_request["S3Key"] + '/comprehend_entities') try: comprehend.start_entities_detection_job( InputDataConfig={ "S3Uri": uri, "InputFormat": "ONE_DOC_PER_FILE" }, OutputDataConfig={ "S3Uri": output_uri }, DataAccessRoleArn=comprehend_role, JobName=workflow_id, LanguageCode="en" ) except Exception as e: operator_object.update_workflow_status("Error") operator_object.add_workflow_metadata(comprehend_error="Unable to get response from comprehend: {e}".format(e=str(e))) raise MasExecutionError(operator_object.return_output_object()) else: comprehend_job_id = workflow_id operator_object.add_workflow_metadata(comprehend_entity_job_id=comprehend_job_id, entity_output_uri=output_uri) operator_object.update_workflow_status('Executing') return operator_object.return_output_object()