def create_vtt(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)

    try:
        targetLanguageCodes = webcaptions_object.operator_object.configuration["TargetLanguageCodes"]
    except KeyError as e:
        webcaptions_object.operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            WebCaptionsError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    # Convert the web captions for each target language to VTT and store them.
    captions_collection = []
    for lang in targetLanguageCodes:
        webcaptions = webcaptions_object.GetWebCaptions(lang)
        vtt = webcaptions_object.WebCaptionsToVTT(webcaptions)
        metadata = webcaptions_object.PutVTT(lang, vtt)
        captions_collection.append(metadata)

    data = {}
    data["CaptionsCollection"] = captions_collection
    webcaptions_object.PutMediaCollection(operator_object.name, data)

    operator_object.update_workflow_status("Complete")
    return operator_object.return_output_object()

def start_translate_webcaptions(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)

    try:
        source_lang = operator_object.configuration["SourceLanguageCode"]
        target_langs = operator_object.configuration["TargetLanguageCodes"]
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Language codes are not defined")
        raise MasExecutionError(operator_object.return_output_object())

    try:
        terminology_names = operator_object.configuration["TerminologyNames"]
    except KeyError:
        terminology_names = []

    webcaptions = webcaptions_object.GetWebCaptions(source_lang)

    # Translate takes a list of target languages, but it only allows one item
    # in the list, so each target language must be submitted separately.
    webcaptions_object.TranslateWebCaptions(webcaptions, source_lang,
                                            target_langs, terminology_names)

    return operator_object.return_output_object()

def start_wait_operation_lambda(event, context):
    '''Pause a workflow to wait for external processing.

    event is
    - Operation input
    - Operation configuration

    returns: Operation output
    '''
    logger.info(json.dumps(event))
    operator_object = MediaInsightsOperationHelper(event)
    try:
        update_workflow_execution_status(operator_object.workflow_execution_id,
                                         awsmie.WORKFLOW_STATUS_WAITING, "")
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            WaitError="Unable to set workflow status to Waiting {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    return operator_object.return_output_object()

def web_captions(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)

    transcript = webcaptions_object.GetTranscript()
    webcaptions = webcaptions_object.TranscribeToWebCaptions(transcript)
    webcaptions_object.PutWebCaptions(webcaptions)

    operator_object.update_workflow_status("Complete")
    return operator_object.return_output_object()

def test_lambda_handler(event, context, operator_name, mediaType, status, type):
    try:
        print(json.dumps(event))
        # Set output status, media, and metadata for the workflow - these get
        # passed to other stages of the workflow through the control plane.
        dataplane = DataPlane()
        operator_object = MediaInsightsOperationHelper(event)
        operator_object.update_workflow_status("Complete")
        metadata = {}
        metadata[operator_object.name] = {
            "Meta": "Workflow metadata for " + operator_object.name
        }
        if "TestCustomConfig" in operator_object.configuration:
            metadata[operator_object.name]["TestCustomConfig"] = \
                operator_object.configuration["TestCustomConfig"]
        operator_object.add_workflow_metadata_json(metadata)
        if "OutputMediaType" in operator_object.configuration:
            mediaType = operator_object.configuration["OutputMediaType"]
        if mediaType == "Video":
            operator_object.add_media_object(
                "Video", "S3BucketFrom{}".format(operator_object.name),
                "S3/Key/From/{}/video".format(operator_object.name))
        elif mediaType == "Audio":
            operator_object.add_media_object(
                "Audio", "S3BucketFrom{}".format(operator_object.name),
                "S3/Key/From/{}/audio".format(operator_object.name))
        elif mediaType == "Image":
            operator_object.add_media_object(
                "Image", "S3BucketFrom{}".format(operator_object.name),
                "S3/Key/From/{}/image".format(operator_object.name))
        elif mediaType == "Text":
            operator_object.add_media_object(
                "Text", "S3BucketFrom{}".format(operator_object.name),
                "S3/Key/From/{}/text".format(operator_object.name))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            Message="Oh no! Something went wrong: {}".format(str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if status == "Fail":
            operator_object.update_workflow_status("Error")
        else:
            operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()

def filter_operation_lambda(event, context):
    '''Decide whether an operation should run or be skipped.

    event is
    - Operation input
    - Operation configuration

    returns: Operation output
    - Operation status "Skipped" if operation should be skipped
    '''
    logger.info(json.dumps(event))
    operation_object = MediaInsightsOperationHelper(event)
    if (operation_object.configuration["MediaType"] != "MetadataOnly"
            and operation_object.configuration["MediaType"] not in operation_object.input["Media"]):
        operation_object.update_workflow_status(awsmie.OPERATION_STATUS_SKIPPED)
    elif operation_object.configuration["Enabled"] == False:
        operation_object.update_workflow_status(awsmie.OPERATION_STATUS_SKIPPED)
    else:
        operation_object.update_workflow_status(awsmie.OPERATION_STATUS_STARTED)
    return operation_object.return_output_object()

def web_captions(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)

    transcript = webcaptions_object.GetTranscript()
    webcaptions = webcaptions_object.TranscribeToWebCaptions(transcript)

    # Save the original Transcribe-generated captions so they can be compared
    # to any ground-truth modifications made later, which lets us calculate
    # quality metrics for the machine translation.
    webcaptions_object.PutWebCaptions(webcaptions, source="TranscribeVideo")

    # If a vtt file was input, use that as the most recent version of the
    # webcaptions file.
    if webcaptions_object.existing_subtitles:
        webcaptions = vttToWebCaptions(operator_object,
                                       webcaptions_object.existing_subtitles_object)

    webcaptions_object.PutWebCaptions(webcaptions)

    operator_object.update_workflow_status("Complete")
    return operator_object.return_output_object()

def check_wait_operation_lambda(event, context):
    '''Check if a workflow is still in a Waiting state.

    event is
    - Operation input
    - Operation configuration

    returns: Operation output
    '''
    logger.info(json.dumps(event))
    operator_object = MediaInsightsOperationHelper(event)
    execution_table = DYNAMO_RESOURCE.Table(WORKFLOW_EXECUTION_TABLE_NAME)
    response = execution_table.get_item(
        Key={'Id': operator_object.workflow_execution_id},
        ConsistentRead=True)
    if "Item" in response:
        workflow_execution = response["Item"]
    else:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            WaitError="Unable to find Waiting workflow execution {}".format(
                operator_object.workflow_execution_id))
        raise MasExecutionError(operator_object.return_output_object())

    logger.info("workflow_execution: {}".format(json.dumps(workflow_execution)))

    if workflow_execution["Status"] == awsmie.WORKFLOW_STATUS_WAITING:
        operator_object.update_workflow_status("Executing")
        return operator_object.return_output_object()
    elif workflow_execution["Status"] == awsmie.WORKFLOW_STATUS_STARTED:
        operator_object.update_workflow_status("Complete")
    else:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            WaitError="Unexpected workflow execution status {}".format(
                workflow_execution["Status"]))
        raise MasExecutionError(operator_object.return_output_object())

    return operator_object.return_output_object()

def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)

    # Get media metadata from input event
    try:
        asset_id = operator_object.asset_id
        bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
    except Exception as exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            VmapGenerationError="Missing a required metadata key {e}".format(e=exception))
        raise MasExecutionError(operator_object.return_output_object())

    # Get slots metadata from dataplane
    try:
        slots = {}
        params = {"asset_id": asset_id, "operator_name": "slotDetection"}
        # Page through the dataplane results using the cursor.
        while True:
            resp = dataplane.retrieve_asset_metadata(**params)
            if "operator" in resp and resp["operator"] == "slotDetection":
                __update_and_merge_lists(slots, resp["results"])
            if "cursor" not in resp:
                break
            params["cursor"] = resp["cursor"]
        print("slots: {}".format(slots))
    except Exception as exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            VmapGenerationError="Unable to retrieve metadata for asset {}: {}".format(
                asset_id, exception))
        raise MasExecutionError(operator_object.return_output_object())

    try:
        # Select slots with highest scores
        slots["slots"].sort(key=lambda slot: slot["Score"])
        top_slots = slots["slots"][-top_slots_qty:]

        # Generate VMAP and add object
        key = 'private/assets/{}/vmap/ad_breaks.vmap'.format(asset_id)
        __write_vmap(top_slots, bucket, key)
        operator_object.add_media_object("VMAP", bucket, key)

        # Set workflow status complete
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    except Exception as exception:
        print("Exception:\n", exception)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(VmapGenerationError=str(exception))
        raise MasExecutionError(operator_object.return_output_object())

def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
        # If operator_object.input["Media"]["Text"]["S3Key"] is a json file,
        # then we're working with metadata about the text file and need to
        # get the actual transcript text from the TextTranscriptUri field.
        # Otherwise we assume operator_object.input["Media"]["Text"]["S3Key"]
        # contains only the transcript text.
        file_ext = str(key.split('.')[-1])
        if file_ext == "json":
            obj = s3.get_object(Bucket=bucket, Key=key)
            results = obj['Body'].read().decode('utf-8')
            results_json = json.loads(results)
            try:
                uri_data = results_json["TextTranscriptUri"]
            except KeyError:
                raise MasExecutionError("JSON can only be passed in from AWS transcribe")
            else:
                bucket = uri_data['S3Bucket']
                key = uri_data['S3Key']
        uri = "s3://" + bucket + '/' + key
        # If input text is empty then we're done.
        response = s3.head_object(Bucket=bucket, Key=key)
        if response['ContentLength'] < 1:
            operator_object.update_workflow_status("Complete")
            operator_object.add_workflow_metadata(
                comprehend_phrases_job_id="Empty input --> empty output.")
            return operator_object.return_output_object()
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    dataplane = DataPlane()
    output_uri_request = dataplane.generate_media_storage_path(asset_id, workflow_id)
    output_uri = "s3://{bucket}/{key}".format(
        bucket=output_uri_request["S3Bucket"],
        key=output_uri_request["S3Key"] + "/comprehend_phrases")
    try:
        comprehend.start_key_phrases_detection_job(
            InputDataConfig={
                'S3Uri': uri,
                'InputFormat': 'ONE_DOC_PER_FILE'
            },
            OutputDataConfig={'S3Uri': output_uri},
            DataAccessRoleArn=comprehend_role,
            JobName=workflow_id,
            LanguageCode='en')
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Unable to get response from comprehend: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        comprehend_job_id = workflow_id
        operator_object.add_workflow_metadata(
            comprehend_phrases_job_id=comprehend_job_id, output_uri=output_uri)
        operator_object.update_workflow_status('Executing')
        return operator_object.return_output_object()

def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        job_id = operator_object.metadata["comprehend_phrases_job_id"]
        asset_id = operator_object.asset_id
        workflow_id = operator_object.workflow_execution_id
        # If Comprehend wasn't run due to empty text input, then we're done
        if job_id == "Empty input --> empty output.":
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="No valid job id")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = comprehend.list_key_phrases_detection_jobs(
            Filter={'JobName': job_id})
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Unable to get response from comprehend: {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        print(response)
        comprehend_status = response["KeyPhrasesDetectionJobPropertiesList"][0]["JobStatus"]
        if comprehend_status == "SUBMITTED" or comprehend_status == "IN_PROGRESS":
            operator_object.add_workflow_metadata(comprehend_phrases_job_id=job_id)
            operator_object.update_workflow_status("Executing")
            return operator_object.return_output_object()
        elif comprehend_status == "COMPLETED":
            # Parse the bucket and key of the job output from its S3 uri.
            output_uri = response["KeyPhrasesDetectionJobPropertiesList"][0]["OutputDataConfig"]["S3Uri"]
            delimiter = '/'
            bucket = delimiter.join(output_uri.split(delimiter)[2:3])
            file_name = output_uri.split(delimiter)[-1]
            key = delimiter.join(output_uri.split(delimiter)[3:-1]) + '/' + file_name
            comprehend_tarball = read_from_s3(bucket, key)
            comprehend_data = {
                "LanguageCode": response['KeyPhrasesDetectionJobPropertiesList'][0]['LanguageCode'],
                "Results": []
            }
            if comprehend_tarball["Status"] == "Success":
                # The Comprehend output is a tarball; extract each result file.
                input_bytes = comprehend_tarball["Object"]
                with tarfile.open(fileobj=BytesIO(input_bytes)) as tf:
                    for member in tf:
                        if member.isfile():
                            comprehend_data["Results"].append(
                                tf.extractfile(member).read().decode('utf-8'))
                dataplane = DataPlane()
                metadata_upload = dataplane.store_asset_metadata(
                    asset_id, "key_phrases", workflow_id, comprehend_data)
                if "Status" not in metadata_upload:
                    operator_object.update_workflow_status("Error")
                    operator_object.add_workflow_metadata(
                        comprehend_error="Unable to store key phrases data {e}".format(
                            e=metadata_upload))
                    raise MasExecutionError(operator_object.return_output_object())
                else:
                    if metadata_upload["Status"] == "Success":
                        operator_object.add_workflow_metadata(
                            comprehend_phrases_job_id=job_id, output_uri=output_uri)
                        operator_object.update_workflow_status("Complete")
                        return operator_object.return_output_object()
                    else:
                        operator_object.update_workflow_status("Error")
                        operator_object.add_workflow_metadata(
                            comprehend_error="Unable to store key phrases data {e}".format(
                                e=metadata_upload))
                        raise MasExecutionError(operator_object.return_output_object())
            else:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    comprehend_phrases_job_id=job_id,
                    comprehend_error="could not retrieve output from s3: {e}".format(
                        e=comprehend_tarball["Message"]))
                raise MasExecutionError(operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                comprehend_phrases_job_id=job_id,
                comprehend_error="comprehend returned as failed: {e}".format(
                    e=response["KeyPhrasesDetectionJobPropertiesList"][0]["Message"]))
            raise MasExecutionError(operator_object.return_output_object())

def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
        # If operator_object.input["Media"]["Text"]["S3Key"] is a json file,
        # then we're working with metadata about the text file and need to
        # get the actual transcript text from the TextTranscriptUri field.
        # Otherwise we assume operator_object.input["Media"]["Text"]["S3Key"]
        # contains only the transcript text.
        file_ext = str(key.split('.')[-1])
        if file_ext == "json":
            obj = s3.get_object(Bucket=bucket, Key=key)
            results = obj['Body'].read().decode('utf-8')
            results_json = json.loads(results)
            try:
                uri_data = results_json["TextTranscriptUri"]
            except KeyError:
                raise MasExecutionError("JSON can only be passed in from AWS transcribe")
            else:
                bucket = uri_data['S3Bucket']
                key = uri_data['S3Key']
        uri = "s3://" + bucket + '/' + key
        response = s3.head_object(Bucket=bucket, Key=key)
        # If a KmsKeyId is specified as an input to this operator, then use it
        # to enable encryption in the Comprehend job.
        kms_key_id = ""
        if "KmsKeyId" in operator_object.configuration:
            kms_key_id = operator_object.configuration["KmsKeyId"]
            print("Found a KMS Key Id. Encryption will be enabled in the Comprehend job.")
        else:
            print("No KMS Key was specified. Encryption will not be enabled in the Comprehend job.")
        # If input text is empty then we're done.
        if response['ContentLength'] < 1:
            operator_object.update_workflow_status("Complete")
            operator_object.add_workflow_metadata(
                comprehend_entity_job_id="Empty input --> empty output.")
            return operator_object.return_output_object()
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(comprehend_error="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    dataplane = DataPlane()
    output_uri_request = dataplane.generate_media_storage_path(asset_id, workflow_id)
    output_uri = "s3://{bucket}/{key}".format(
        bucket=output_uri_request["S3Bucket"],
        key=output_uri_request["S3Key"] + '/comprehend_entities')
    try:
        if kms_key_id != '':
            # If the user specified a KMS key, then enable Comprehend job encryption.
            comprehend.start_entities_detection_job(
                InputDataConfig={
                    "S3Uri": uri,
                    "InputFormat": "ONE_DOC_PER_FILE"
                },
                OutputDataConfig={
                    "S3Uri": output_uri,
                    "KmsKeyId": kms_key_id
                },
                DataAccessRoleArn=comprehend_role,
                VolumeKmsKeyId=kms_key_id,
                JobName=workflow_id,
                LanguageCode="en")
        else:
            comprehend.start_entities_detection_job(
                InputDataConfig={
                    "S3Uri": uri,
                    "InputFormat": "ONE_DOC_PER_FILE"
                },
                OutputDataConfig={"S3Uri": output_uri},
                DataAccessRoleArn=comprehend_role,
                JobName=workflow_id,
                LanguageCode="en")
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            comprehend_error="Unable to get response from comprehend: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        comprehend_job_id = workflow_id
        operator_object.add_workflow_metadata(
            comprehend_entity_job_id=comprehend_job_id,
            entity_output_uri=output_uri)
        operator_object.update_workflow_status('Executing')
        return operator_object.return_output_object()

def web_to_vtt(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    try:
        targetLanguageCodes = operator_object.configuration["TargetLanguageCodes"]
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            CaptionsError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    captions_collection = []
    for lang in targetLanguageCodes:
        captions = get_webcaptions_json(operator_object, lang)

        # Build the VTT file from the caption start/end timestamps and text.
        vtt = 'WEBVTT\n\n'
        for caption in captions:
            vtt += (formatTimeVTT(float(caption["start"])) + ' --> ' +
                    formatTimeVTT(float(caption["end"])) + '\n')
            vtt += caption["caption"] + '\n\n'

        response = dataplane.generate_media_storage_path(asset_id, workflow_id)
        print(json.dumps(response))
        bucket = response["S3Bucket"]
        key = response["S3Key"] + 'Captions_' + lang + '.vtt'
        s3_object = s3_resource.Object(bucket, key)
        s3_object.put(Body=vtt)

        metadata = {
            "OperatorName": "VTTCaptions_" + lang,
            "Results": {"S3Bucket": bucket, "S3Key": key},
            "WorkflowId": workflow_id,
            "LanguageCode": lang
        }
        captions_collection.append(metadata)

    data = {}
    data["CaptionsCollection"] = captions_collection
    metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name,
                                                     workflow_id, data)
    if "Status" not in metadata_upload:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            CaptionsError="Unable to store vtt captions file {e}".format(e=metadata_upload))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if metadata_upload["Status"] == "Success":
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                CaptionsError="Unable to store vtt captions file {e}".format(e=metadata_upload))
            raise MasExecutionError(operator_object.return_output_object())

def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="No valid inputs {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    try:
        source_lang = operator_object.configuration["SourceLanguageCode"]
        target_lang = operator_object.configuration["TargetLanguageCode"]
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Language codes are not defined")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        s3_response = s3.get_object(Bucket=bucket, Key=key)
        transcribe_metadata = json.loads(s3_response["Body"].read().decode("utf-8"))
        transcript = transcribe_metadata["results"]["transcripts"][0]["transcript"]
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to read transcription from S3: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())

    # If input text is empty then we're done.
    if len(transcript) < 1:
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()

    # Tell the NLTK data loader to look for files in /tmp/
    nltk.data.path.append("/tmp/")
    # Download NLTK tokenizers to /tmp/.
    # We use /tmp because that's where AWS Lambda provides write access to the
    # local file system.
    nltk.download('punkt', download_dir='/tmp/')

    # Create a sentence tokenizer for the user-specified source language.
    # Default to English.
    if source_lang == 'fr':
        print("Using French dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    elif source_lang == 'de':
        print("Using German dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
    elif source_lang == 'ru':
        print("Using Russian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    elif source_lang == 'it':
        print("Using Italian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
    elif source_lang == 'pt':
        print("Using Portuguese dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
    elif source_lang == 'es':
        print("Using Spanish dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    else:
        print("Using English dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Split input text into a list of sentences
    sentences = tokenizer.tokenize(transcript)
    print("Input text length: " + str(len(transcript)))
    print("Number of sentences: " + str(len(sentences)))
    translated_text = ''
    transcript_chunk = ''
    for sentence in sentences:
        # Translate can handle 5000 unicode characters per request, but we
        # process no more than 1000 at a time to be safe. Even with input
        # limited to 3000 characters we still saw Translate throttling with a
        # RateExceeded exception; reducing the chunk size to 1000 characters
        # fixed that.
        if (len(sentence) + len(transcript_chunk) < 1000):
            transcript_chunk = transcript_chunk + ' ' + sentence
        else:
            try:
                print("Translation input text length: " + str(len(transcript_chunk)))
                translation_chunk = translate_client.translate_text(
                    Text=transcript_chunk,
                    SourceLanguageCode=source_lang,
                    TargetLanguageCode=target_lang)
                print("Translation output text length: " +
                      str(len(translation_chunk["TranslatedText"])))
            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    TranslateError="Unable to get response from translate: {e}".format(
                        e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())
            translated_text = translated_text + ' ' + translation_chunk["TranslatedText"]
            transcript_chunk = sentence
    print("Translating the final chunk of input text...")
    try:
        print("Translation input text length: " + str(len(transcript_chunk)))
        translation_chunk = translate_client.translate_text(
            Text=transcript_chunk,
            SourceLanguageCode=source_lang,
            TargetLanguageCode=target_lang)
        print("Translation output text length: " +
              str(len(translation_chunk["TranslatedText"])))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to get response from translate: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    translated_text = translated_text + ' ' + translation_chunk["TranslatedText"]

    # Put the final result into a JSON object because the MIE dataplane
    # requires it.
    translation_result = {}
    translation_result["TranslatedText"] = translated_text
    translation_result["SourceLanguageCode"] = source_lang
    translation_result["TargetLanguageCode"] = target_lang
    print("Final translation text length: " + str(len(translated_text)))
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id, operator_object.name,
                                                     workflow_id, translation_result)
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            TranslateError="Unable to upload metadata for asset: {asset}".format(
                asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if metadata_upload['Status'] == 'Success':
            operator_object.add_media_object('Text', metadata_upload['Bucket'],
                                             metadata_upload['Key'])
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                TranslateError="Unable to upload metadata for asset: {asset}".format(
                    asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())

def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # Get MediaConvert job id
    try:
        job_id = operator_object.metadata["MediaconvertJobId"]
        workflow_id = operator_object.workflow_execution_id
        input_file = operator_object.metadata["MediaconvertInputFile"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            MediaconvertError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    # Get asset id
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print("No asset_id in this workflow")
        asset_id = ''
    # Get mediaconvert endpoint from cache if available
    if "MEDIACONVERT_ENDPOINT" in os.environ:
        mediaconvert_endpoint = os.environ["MEDIACONVERT_ENDPOINT"]
        customer_mediaconvert = boto3.client("mediaconvert", region_name=region,
                                             endpoint_url=mediaconvert_endpoint)
    else:
        try:
            response = mediaconvert.describe_endpoints()
        except Exception as e:
            print("Exception:\n", e)
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(MediaconvertError=str(e))
            raise MasExecutionError(operator_object.return_output_object())
        else:
            mediaconvert_endpoint = response["Endpoints"][0]["Url"]
            # Cache the mediaconvert endpoint in order to avoid getting
            # throttled on the DescribeEndpoints API.
            os.environ["MEDIACONVERT_ENDPOINT"] = mediaconvert_endpoint
            customer_mediaconvert = boto3.client("mediaconvert", region_name=region,
                                                 endpoint_url=mediaconvert_endpoint)
    # Get MediaConvert job results
    try:
        response = customer_mediaconvert.get_job(Id=job_id)
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediaconvertError=str(e),
                                              MediaconvertJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["Job"]["Status"] == 'IN_PROGRESS' or response["Job"]["Status"] == 'PROGRESSING':
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(MediaconvertJobId=job_id,
                                                  MediaconvertInputFile=input_file,
                                                  AssetId=asset_id,
                                                  WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["Job"]["Status"] == 'COMPLETE':
            input_filename = os.path.splitext(
                operator_object.metadata["MediaconvertInputFile"].split("/")[-1])[0]
            # Get thumbnail object
            thumbnail_output_uri = response["Job"]["Settings"]["OutputGroups"][0]["OutputGroupSettings"]["FileGroupSettings"]["Destination"]
            thumbnail_extension = response["Job"]["Settings"]["OutputGroups"][0]["Outputs"][0]["Extension"]
            thumbnail_modifier = response["Job"]["Settings"]["OutputGroups"][0]["Outputs"][0]["NameModifier"]
            thumbnail_bucket = thumbnail_output_uri.split("/")[2]
            thumbnail_folder = "/".join(thumbnail_output_uri.split("/")[3:-1])
            thumbnail_key = thumbnail_folder + "/" + input_filename + thumbnail_modifier + "." + thumbnail_extension
            operator_object.add_media_object("Thumbnail", thumbnail_bucket, thumbnail_key)
            # Get audio object
            audio_output_uri = response["Job"]["Settings"]["OutputGroups"][1]["OutputGroupSettings"]["FileGroupSettings"]["Destination"]
            audio_extension = response["Job"]["Settings"]["OutputGroups"][1]["Outputs"][0]["Extension"]
            audio_modifier = response["Job"]["Settings"]["OutputGroups"][1]["Outputs"][0]["NameModifier"]
            audio_bucket = audio_output_uri.split("/")[2]
            audio_folder = "/".join(audio_output_uri.split("/")[3:-1])
            audio_key = audio_folder + "/" + input_filename + audio_modifier + "." + audio_extension
            operator_object.add_media_object("Audio", audio_bucket, audio_key)
            operator_object.add_workflow_metadata(MediaconvertJobId=job_id)
            # Get mp4 proxy encode object
            proxy_encode_output_uri = response["Job"]["Settings"]["OutputGroups"][2]["OutputGroupSettings"]["FileGroupSettings"]["Destination"]
            proxy_encode_extension = response["Job"]["Settings"]["OutputGroups"][2]["Outputs"][0]["Extension"]
            proxy_encode_modifier = response["Job"]["Settings"]["OutputGroups"][2]["Outputs"][0]["NameModifier"]
            proxy_encode_bucket = proxy_encode_output_uri.split("/")[2]
            proxy_encode_folder = "/".join(proxy_encode_output_uri.split("/")[3:-1])
            proxy_encode_key = proxy_encode_folder + "/" + input_filename + proxy_encode_modifier + "." + proxy_encode_extension
            operator_object.add_media_object("ProxyEncode", proxy_encode_bucket, proxy_encode_key)
            # Set workflow status complete
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                MediaconvertError="Unhandled exception, unable to get status from mediaconvert: {response}".format(
                    response=response),
                MediaconvertJobId=job_id)
            raise MasExecutionError(operator_object.return_output_object())

def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        job_id = operator_object.metadata["MediaconvertJobId"]
        workflow_id = operator_object.workflow_execution_id
        input_file = operator_object.metadata["MediaconvertInputFile"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            MediaconvertError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print("No asset_id in this workflow")
        asset_id = ''
    try:
        response = mediaconvert.describe_endpoints()
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediaconvertError=str(e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        mediaconvert_endpoint = response["Endpoints"][0]["Url"]
        customer_mediaconvert = boto3.client("mediaconvert",
                                             region_name=region,
                                             endpoint_url=mediaconvert_endpoint)
    try:
        response = customer_mediaconvert.get_job(Id=job_id)
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediaconvertError=str(e),
                                              MediaconvertJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["Job"]["Status"] == 'IN_PROGRESS' or response["Job"]["Status"] == 'PROGRESSING':
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(MediaconvertJobId=job_id,
                                                  MediaconvertInputFile=input_file,
                                                  AssetId=asset_id,
                                                  WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["Job"]["Status"] == 'COMPLETE':
            # TODO: Store job details as metadata in dataplane
            # TODO: Get output uri from dataplane
            output_uri = response["Job"]["Settings"]["OutputGroups"][0]["OutputGroupSettings"]["FileGroupSettings"]["Destination"]
            extension = response["Job"]["Settings"]["OutputGroups"][0]["Outputs"][0]["Extension"]
            modifier = response["Job"]["Settings"]["OutputGroups"][0]["Outputs"][0]["NameModifier"]
            bucket = output_uri.split("/")[2]
            folder = "/".join(output_uri.split("/")[3:-1])
            file_name = os.path.splitext(
                operator_object.metadata["MediaconvertInputFile"])[0].split("/")[-1]
            key = folder + "/" + file_name + modifier + "." + extension
            operator_object.add_media_object("Audio", bucket, key)
            operator_object.add_workflow_metadata(MediaconvertJobId=job_id)
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                MediaconvertError="Unhandled exception, unable to get status from mediaconvert: {response}".format(
                    response=response),
                MediaconvertJobId=job_id)
            raise MasExecutionError(operator_object.return_output_object())

def start_polly_webcaptions(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)

    captions_collection = webcaptions_object.GetWebCaptionsCollection()
    print("INPUT CAPTIONS COLLECTION")
    print(json.dumps(captions_collection))
    for caption in captions_collection:
        # Always start from the WebCaptions data since it is the most recently
        # edited version. Convert WebCaptions to a text-only transcript.
        transcript = webcaptions_object.GetTextOnlyTranscript(caption["TargetLanguageCode"])

        # If input text is empty then we're done.
        if len(transcript) < 1:
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()

        # Get the Polly language code for the transcript; we should just pass
        # this along in the event later.
        language_code = translate_to_polly_language_code(caption["TargetLanguageCode"])

        if language_code == "not supported":
            caption["PollyStatus"] = "not supported"
        else:
            try:
                # Set voice_id based on language
                response = polly.describe_voices(LanguageCode=language_code)
            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    PollyCollectionError="Unable to get response from polly describe_voices: {e}".format(
                        e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())
            else:
                # Just take the first voice in the list. Later we could extend
                # this to choose a voice based on other criteria, such as gender.
                voice_id = response["Voices"][0]["Id"]
                caption["VoiceId"] = voice_id

                caption["PollyAudio"] = {}
                caption["PollyAudio"]["S3Key"] = ('private/assets/' + operator_object.asset_id +
                                                  "/workflows/" + operator_object.workflow_execution_id +
                                                  "/" + "audio_only" + "_" +
                                                  caption["TargetLanguageCode"])
                caption["PollyAudio"]["S3Bucket"] = caption["TranslationText"]["S3Bucket"]
                try:
                    polly_response = polly.start_speech_synthesis_task(
                        OutputFormat='mp3',
                        OutputS3BucketName=caption["PollyAudio"]["S3Bucket"],
                        OutputS3KeyPrefix=caption["PollyAudio"]["S3Key"],
                        Text=transcript,
                        TextType='text',
                        VoiceId=voice_id)
                except Exception as e:
                    operator_object.update_workflow_status("Error")
                    operator_object.add_workflow_metadata(
                        PollyCollectionError="Unable to get response from polly: {e}".format(
                            e=str(e)))
                    raise MasExecutionError(operator_object.return_output_object())
                else:
                    polly_job_id = polly_response['SynthesisTask']['TaskId']
                    caption["PollyTaskId"] = polly_job_id
                    caption["PollyStatus"] = "started"
                    # Polly adds the polly task id to the S3 key of the output.
                    caption["PollyAudio"]["S3Key"] = ('private/assets/' + operator_object.asset_id +
                                                      "/workflows/" + operator_object.workflow_execution_id +
                                                      "/" + "audio_only" + "_" +
                                                      caption["TargetLanguageCode"] +
                                                      "." + polly_job_id + ".mp3")

    operator_object.add_workflow_metadata(
        PollyCollection=captions_collection,
        WorkflowExecutionId=operator_object.workflow_execution_id,
        AssetId=operator_object.asset_id)
    operator_object.update_workflow_status('Executing')
    return operator_object.return_output_object()

def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        task_id = operator_object.metadata["PollyJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        polly_response = polly.get_speech_synthesis_task(TaskId=task_id)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyError="Unable to get response from polly: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        polly_status = polly_response["SynthesisTask"]["TaskStatus"]
        print("The status from polly is:\n", polly_status)
        if polly_status == "inProgress":
            polly_job_id = polly_response["SynthesisTask"]["TaskId"]
            operator_object.add_workflow_metadata(PollyJobId=polly_job_id,
                                                  AssetId=asset_id,
                                                  WorkflowExecutionId=workflow_id)
            operator_object.update_workflow_status("Executing")
            return operator_object.return_output_object()
        elif polly_status == "completed":
            # TODO: Store job details as metadata in dataplane
            # Parse the bucket and key out of the output uri.
            uri = polly_response["SynthesisTask"]["OutputUri"]
            file = uri.split("/")[5]
            folder = uri.split("/")[4]
            bucket = uri.split("/")[3]
            key = folder + "/" + file
            operator_object.add_workflow_metadata(PollyJobId=task_id)
            operator_object.add_media_object("Audio", bucket, key)
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        elif polly_status == "scheduled":
            operator_object.add_workflow_metadata(PollyJobId=task_id,
                                                  AssetId=asset_id,
                                                  WorkflowExecutionId=workflow_id)
            operator_object.update_workflow_status("Executing")
            return operator_object.return_output_object()
        elif polly_status == "failed":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                PollyError="Polly returned as failed: {e}".format(
                    e=str(polly_response["SynthesisTask"]["TaskStatusReason"])))
            raise MasExecutionError(operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                PollyError="Polly returned an unrecognized status: {e}".format(
                    e=str(polly_response["SynthesisTask"]["TaskStatusReason"])))
            raise MasExecutionError(operator_object.return_output_object())

def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = str(operator_object.workflow_execution_id)
        asset_id = operator_object.asset_id
        bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
        key = operator_object.input["Media"]["Video"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            VideoTranscodingError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    file_input = "s3://" + bucket + "/" + key
    hls_destination = "s3://" + bucket + "/private/assets/" + asset_id + "/hls/playlist"
    proxy_destination = "s3://" + bucket + "/private/assets/" + asset_id + "/proxy/" + asset_id
    audio_destination = "s3://" + bucket + "/private/assets/" + asset_id + "/audio/" + asset_id
    # Get mediaconvert endpoint from cache if available
    if "MEDIACONVERT_ENDPOINT" in os.environ:
        mediaconvert_endpoint = os.environ["MEDIACONVERT_ENDPOINT"]
        customer_mediaconvert = boto3.client("mediaconvert", region_name=region,
                                             endpoint_url=mediaconvert_endpoint)
    else:
        try:
            response = mediaconvert.describe_endpoints()
        except Exception as e:
            print("Exception:\n", e)
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(VideoTranscodingError=str(e))
            raise MasExecutionError(operator_object.return_output_object())
        else:
            mediaconvert_endpoint = response["Endpoints"][0]["Url"]
            # Cache the mediaconvert endpoint in order to avoid getting
            # throttled on the DescribeEndpoints API.
            os.environ["MEDIACONVERT_ENDPOINT"] = mediaconvert_endpoint
            customer_mediaconvert = boto3.client("mediaconvert", region_name=region,
                                                 endpoint_url=mediaconvert_endpoint)
    try:
        response = customer_mediaconvert.create_job(
            Role=mediaconvert_role,
            Settings={
                "OutputGroups": [
                    {
                        "Name": "Apple HLS",
                        "Outputs": [{
                            "Preset": "System-Avc_16x9_1080p_29_97fps_8500kbps",
                            "NameModifier": "_hls"
                        }],
                        "OutputGroupSettings": {
                            "Type": "HLS_GROUP_SETTINGS",
                            "HlsGroupSettings": {
                                "ManifestDurationFormat": "INTEGER",
                                "SegmentLength": 1,
                                "TimedMetadataId3Period": 10,
                                "CaptionLanguageSetting": "OMIT",
                                "TimedMetadataId3Frame": "PRIV",
                                "CodecSpecification": "RFC_4281",
                                "OutputSelection": "MANIFESTS_AND_SEGMENTS",
                                "ProgramDateTimePeriod": 600,
                                "MinSegmentLength": 0,
                                "MinFinalSegmentLength": 0,
                                "DirectoryStructure": "SINGLE_DIRECTORY",
                                "ProgramDateTime": "EXCLUDE",
                                "SegmentControl": "SEGMENTED_FILES",
                                "ManifestCompression": "NONE",
                                "ClientCache": "ENABLED",
                                "StreamInfResolution": "INCLUDE",
                                "Destination": hls_destination
                            }
                        }
                    },
                    {
                        "CustomName": "Proxy",
                        "Name": "File Group",
                        "Outputs": [{
                            "VideoDescription": {
                                "ScalingBehavior": "DEFAULT",
                                "TimecodeInsertion": "DISABLED",
                                "AntiAlias": "ENABLED",
                                "Sharpness": 50,
                                "CodecSettings": {
                                    "Codec": "H_264",
                                    "H264Settings": {
                                        "InterlaceMode": "PROGRESSIVE",
                                        "NumberReferenceFrames": 3,
                                        "Syntax": "DEFAULT",
                                        "Softness": 0,
                                        "GopClosedCadence": 1,
                                        "GopSize": 90,
                                        "Slices": 1,
                                        "GopBReference": "DISABLED",
                                        "SlowPal": "DISABLED",
                                        "SpatialAdaptiveQuantization": "ENABLED",
                                        "TemporalAdaptiveQuantization": "ENABLED",
                                        "FlickerAdaptiveQuantization": "DISABLED",
                                        "EntropyEncoding": "CABAC",
                                        "Bitrate": 5000000,
                                        "FramerateControl": "SPECIFIED",
                                        "RateControlMode": "CBR",
                                        "CodecProfile": "MAIN",
                                        "Telecine": "NONE",
                                        "MinIInterval": 0,
                                        "AdaptiveQuantization": "HIGH",
                                        "CodecLevel": "AUTO",
                                        "FieldEncoding": "PAFF",
                                        "SceneChangeDetect": "ENABLED",
                                        "QualityTuningLevel": "SINGLE_PASS",
                                        "FramerateConversionAlgorithm": "DUPLICATE_DROP",
                                        "UnregisteredSeiTimecode": "DISABLED",
                                        "GopSizeUnits": "FRAMES",
                                        "ParControl": "SPECIFIED",
                                        "NumberBFramesBetweenReferenceFrames": 2,
                                        "RepeatPps": "DISABLED",
                                        "FramerateNumerator": 30,
                                        "FramerateDenominator": 1,
                                        "ParNumerator": 1,
                                        "ParDenominator": 1
                                    }
                                },
                                "AfdSignaling": "NONE",
                                "DropFrameTimecode": "ENABLED",
                                "RespondToAfd": "NONE",
                                "ColorMetadata": "INSERT"
                            },
                            "AudioDescriptions": [{
                                "AudioTypeControl": "FOLLOW_INPUT",
                                "CodecSettings": {
                                    "Codec": "AAC",
                                    "AacSettings": {
                                        "AudioDescriptionBroadcasterMix": "NORMAL",
                                        "RateControlMode": "CBR",
                                        "CodecProfile": "LC",
                                        "CodingMode": "CODING_MODE_2_0",
                                        "RawFormat": "NONE",
                                        "SampleRate": 48000,
                                        "Specification": "MPEG4",
                                        "Bitrate": 64000
                                    }
                                },
                                "LanguageCodeControl": "FOLLOW_INPUT",
                                "AudioSourceName": "Audio Selector 1"
                            }],
                            "ContainerSettings": {
                                "Container": "MP4",
                                "Mp4Settings": {
                                    "CslgAtom": "INCLUDE",
                                    "FreeSpaceBox": "EXCLUDE",
                                    "MoovPlacement": "PROGRESSIVE_DOWNLOAD"
                                }
                            },
                            "Extension": "mp4",
                            "NameModifier": "_proxy"
                        }],
                        "OutputGroupSettings": {
                            "Type": "FILE_GROUP_SETTINGS",
                            "FileGroupSettings": {
                                "Destination": proxy_destination
                            }
                        }
                    },
                    {
                        "CustomName": "Audio",
                        "Name": "File Group",
                        "Outputs": [{
                            "ContainerSettings": {
                                "Container": "MP4",
                                "Mp4Settings": {
                                    "CslgAtom": "INCLUDE",
                                    "CttsVersion": 0,
                                    "FreeSpaceBox": "EXCLUDE",
                                    "MoovPlacement": "PROGRESSIVE_DOWNLOAD"
                                }
                            },
                            "AudioDescriptions": [{
                                "AudioTypeControl": "FOLLOW_INPUT",
                                "AudioSourceName": "Audio Selector 1",
                                "AudioNormalizationSettings": {
                                    "Algorithm": "ITU_BS_1770_2",
                                    "AlgorithmControl": "MEASURE_ONLY",
                                    "LoudnessLogging": "LOG",
                                    "PeakCalculation": "NONE"
                                },
                                "CodecSettings": {
                                    "Codec": "AAC",
                                    "AacSettings": {
                                        "AudioDescriptionBroadcasterMix": "NORMAL",
                                        "Bitrate": 96000,
                                        "RateControlMode": "CBR",
                                        "CodecProfile": "LC",
                                        "CodingMode": "CODING_MODE_2_0",
                                        "RawFormat": "NONE",
                                        "SampleRate": 48000,
                                        "Specification": "MPEG4"
                                    }
                                },
                                "LanguageCodeControl": "FOLLOW_INPUT"
                            }],
                            "Extension": "mp4",
                            "NameModifier": "_audio"
                        }],
                        "OutputGroupSettings": {
                            "Type": "FILE_GROUP_SETTINGS",
                            "FileGroupSettings": {
                                "Destination": audio_destination
                            }
                        }
                    }
                ],
                "Inputs": [{
                    "AudioSelectors": {
                        "Audio Selector 1": {
                            "Offset": 0,
                            "DefaultSelection": "DEFAULT",
                            "ProgramSelection": 1
                        }
                    },
                    "VideoSelector": {
                        "ColorSpace": "FOLLOW",
                        "Rotate": "DEGREE_0",
                        "AlphaBehavior": "DISCARD"
                    },
                    "FilterEnable": "AUTO",
                    "PsiControl": "USE_PSI",
                    "FilterStrength": 0,
                    "DeblockFilter": "DISABLED",
                    "DenoiseFilter": "DISABLED",
                    "TimecodeSource": "EMBEDDED",
                    "FileInput": file_input
                }]
            })
    # TODO: Add support for boto client error handling
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(VideoTranscodingError=str(e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        job_id = response['Job']['Id']
        operator_object.update_workflow_status("Executing")
        operator_object.add_workflow_metadata(VideoTranscodingJobId=job_id,
                                              VideoTranscodingInputFile=file_input,
                                              AssetId=asset_id,
                                              WorkflowExecutionId=workflow_id)
        return operator_object.return_output_object()

def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # Get media metadata from input event
    try:
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
        loudness_bucket = operator_object.input["Media"]["Loudness"]["S3Bucket"]
        loudness_key = operator_object.input["Media"]["Loudness"]["S3Key"]
    except Exception as exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            SlotDetectionError="Missing a required metadata key {e}".format(e=exception))
        raise MasExecutionError(operator_object.return_output_object())
    # Get asset metadata from dataplane
    try:
        asset_metadata = __get_asset_metadata(asset_id)
    except Exception as exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            SlotDetectionError="Unable to retrieve metadata for asset {}: {}".format(
                asset_id, exception))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        # Get detected reasons' timestamps from media and asset metadata
        silences = detect_silences(loudness_bucket, loudness_key)
        black_frames, end_credits = detect_technical_cues(asset_metadata)
        shots = detect_shots(asset_metadata)
        reasons_timestamps = {
            "Silence": silences,
            "BlackFrame": black_frames,
            "ShotChange": shots,
            "EndCredits": end_credits
        }
        media_info = asset_metadata["shotDetection"]["VideoMetadata"][0]
        # Create slots from reasons' timestamps
        print("reasons_timestamps: {}".format(reasons_timestamps))
        slots = []
        for reason in reasons_timestamps:
            for timestamp in reasons_timestamps[reason]:
                slots.append({
                    "Timestamp": float(timestamp),
                    "Score": 1.0,
                    "Reasons": [reason]
                })
        print("slots: {}".format(slots))
        # Consolidate slots and calculate scores
        slots = calculate_scores(slots, media_info, asset_metadata)
        print("scored_slots: {}".format(slots))
    except Exception as exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(SlotDetectionError=str(exception))
        raise MasExecutionError(operator_object.return_output_object())

    operator_object.add_workflow_metadata(AssetId=asset_id,
                                          WorkflowExecutionId=workflow_id)
    operator_object.update_workflow_status("Complete")
    metadata_upload = dataplane.store_asset_metadata(
        asset_id=asset_id,
        operator_name=operator_object.name,
        workflow_id=workflow_id,
        results={"slots": slots})
    print("metadata_upload: {}".format(metadata_upload))
    if metadata_upload["Status"] == "Success":
        print("Uploaded metadata for asset: {asset}".format(asset=asset_id))
    else:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            SlotDetectionError="Unable to upload metadata for asset {}: {}".format(
                asset_id, metadata_upload))
        raise MasExecutionError(operator_object.return_output_object())
    return operator_object.return_output_object()

def check_translate_webcaptions(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)
    try:
        translate_jobs = operator_object.metadata["TextTranslateJobPropertiesList"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
        transcript_storage_path = dataplane.generate_media_storage_path(asset_id, workflow_id)
        bucket = transcript_storage_path['S3Bucket']
        translation_output_path = transcript_storage_path['S3Key'] + "webcaptions_translate_output/"
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())

    # Check the status of each job:
    # - If ANY job has an error, we fail the workflow and return from the loop.
    # - If ANY job is still running, the workflow is still Executing.
    # - If ALL jobs are complete, we reach the end of the loop and the
    #   workflow is complete.
    job_status_list = []
    for job in translate_jobs:
        try:
            job_id = job["JobId"]
        except KeyError as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranslateError="Missing a required metadata key {e}".format(e=e))
            raise MasExecutionError(operator_object.return_output_object())
        try:
            response = translate_client.describe_text_translation_job(JobId=job_id)
            print(response)
            job_status = {
                "JobId": job_id,
                "Status": response["TextTranslationJobProperties"]["JobStatus"]
            }
            job_status_list.append(job_status)
        except Exception as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranslateError=str(e),
                                                  TranslateJobId=job_id)
            raise MasExecutionError(operator_object.return_output_object())
        else:
            if response["TextTranslationJobProperties"]["JobStatus"] in [
                    "IN_PROGRESS", "SUBMITTED"]:
                operator_object.update_workflow_status("Executing")
                operator_object.add_workflow_metadata(
                    TextTranslateJobStatusList=job_status_list,
                    AssetId=asset_id,
                    WorkflowExecutionId=workflow_id)
                return operator_object.return_output_object()
            elif response["TextTranslationJobProperties"]["JobStatus"] in [
                    "FAILED", "COMPLETED_WITH_ERROR", "STOP_REQUESTED", "STOPPED"]:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    TextTranslateJobStatusList=job_status_list,
                    AssetId=asset_id,
                    WorkflowExecutionId=workflow_id)
                raise MasExecutionError(operator_object.return_output_object())
            elif response["TextTranslationJobProperties"]["JobStatus"] == "COMPLETED":
                print("{} is complete".format(job_id))
                operator_object.add_workflow_metadata(
                    TextTranslateJobStatusList=job_status_list,
                    AssetId=asset_id,
                    WorkflowExecutionId=workflow_id)

    # If we made it here, then all the translate jobs are complete.
    # Convert the translations back to WebCaptions and write them out to the
    # dataplane.
    translation_storage_path = dataplane.generate_media_storage_path(asset_id, workflow_id)
    bucket = translation_storage_path['S3Bucket']
    translation_path = translation_storage_path['S3Key']
    webcaptions_collection = []
    for job in translate_jobs:
        try:
            print("Save translation for job {}".format(job["JobId"]))
            translateJobDescription = translate_client.describe_text_translation_job(
                JobId=job["JobId"])
            translateJobS3Uri = translateJobDescription["TextTranslationJobProperties"]["OutputDataConfig"]["S3Uri"]
            translateJobUrl = urlparse(translateJobS3Uri, allow_fragments=False)
            translateJobLanguageCode = translateJobDescription["TextTranslationJobProperties"]["TargetLanguageCodes"][0]
            translateJobS3Location = {
                "Uri": translateJobS3Uri,
                "Bucket": translateJobUrl.netloc,
                "Key": translateJobUrl.path.strip("/")
            }
            # Use the input web captions to convert the translation output
            # back to the web captions format.
            for outputS3ObjectKey in map(
                    lambda s: s.key,
                    s3_resource.Bucket(translateJobS3Location["Bucket"]).objects.filter(
                        Prefix=translateJobS3Location["Key"] + "/", Delimiter="/")):
                print("Save translation for each output of job {} output {}".format(
                    job["JobId"], outputS3ObjectKey))
                outputFilename = ntpath.basename(outputS3ObjectKey)
                translateOutput = s3_resource.Object(
                    translateJobS3Location["Bucket"],
                    outputS3ObjectKey).get()["Body"].read().decode("utf-8")
                inputWebCaptions = webcaptions_object.GetWebCaptions(
                    translateJobDescription["TextTranslationJobProperties"]["SourceLanguageCode"])
                outputWebCaptions = webcaptions_object.DelimitedToWebCaptions(
                    inputWebCaptions, translateOutput, "<123>", 15)
                print(outputS3ObjectKey)
                (targetLanguageCode, basename, ext) = outputFilename.split(".")
                operator_metadata = webcaptions_object.PutWebCaptions(
                    outputWebCaptions, targetLanguageCode)
                # Save a copy of the translation text without delimiters
                translation_text = translateOutput.replace("<123>", "")
                translation_text_key = translation_path + "translation" + "_" + targetLanguageCode + ".txt"
                s3_object = s3_resource.Object(bucket, translation_text_key)
                s3_object.put(Body=translation_text)
                metadata = {
                    "OperatorName": "TranslateWebCaptions_" + translateJobLanguageCode,
                    "TranslationText": {
                        "S3Bucket": bucket,
                        "S3Key": translation_text_key
                    },
                    "WebCaptions": operator_metadata,
                    "WorkflowId": workflow_id,
                    "TargetLanguageCode": translateJobLanguageCode
                }
                print(json.dumps(metadata))
                webcaptions_collection.append(metadata)
        except Exception as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                CaptionsError="Unable to process translate output in S3: {e}".format(
                    e=str(e)))
            raise MasExecutionError(operator_object.return_output_object())

    data = {}
    data["CaptionsCollection"] = webcaptions_collection
    webcaptions_object.PutMediaCollection(operator_object.name, data)

    return operator_object.return_output_object()

def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # If Transcribe wasn't run due to silent audio, then we're done
    if "Mediainfo_num_audio_tracks" in event["Input"]["MetaData"] and \
            event["Input"]["MetaData"]["Mediainfo_num_audio_tracks"] == "0":
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    try:
        job_id = operator_object.metadata["TranscribeJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranscribeError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = transcribe.get_transcription_job(TranscriptionJobName=job_id)
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError=str(e), TranscribeJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["TranscriptionJob"]["TranscriptionJobStatus"] == "IN_PROGRESS":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "FAILED":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError=str(response["TranscriptionJob"]["FailureReason"]))
            raise MasExecutionError(operator_object.return_output_object())
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "COMPLETED":
            transcribe_uri = response["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]
            http = urllib3.PoolManager()
            transcription = http.request('GET', transcribe_uri)
            transcription_data = transcription.data.decode("utf-8")
            transcription_json = json.loads(transcription_data)
            # Concatenate the transcript fragments into a plain-text transcript
            text_only_transcript = ''
            for transcript_fragment in transcription_json["results"]["transcripts"]:
                text_only_transcript += transcript_fragment["transcript"]
            print(text_only_transcript)
            dataplane = DataPlane()
            s3 = boto3.client('s3')
            transcript_storage_path = dataplane.generate_media_storage_path(asset_id, workflow_id)
            key = transcript_storage_path['S3Key'] + "transcript.txt"
            bucket = transcript_storage_path['S3Bucket']
            s3.put_object(Bucket=bucket, Key=key, Body=text_only_transcript)
            transcription_json["TextTranscriptUri"] = {"S3Bucket": bucket, "S3Key": key}
            metadata_upload = dataplane.store_asset_metadata(
                asset_id, operator_object.name, workflow_id, transcription_json)
            if "Status" not in metadata_upload:
                operator_object.add_workflow_metadata(
                    TranscribeError="Unable to upload metadata for asset: {asset}".format(asset=asset_id),
                    TranscribeJobId=job_id)
                operator_object.update_workflow_status("Error")
                raise MasExecutionError(operator_object.return_output_object())
            else:
                if metadata_upload['Status'] == 'Success':
                    operator_object.add_media_object('Text', metadata_upload['Bucket'], metadata_upload['Key'])
                    operator_object.add_workflow_metadata(TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Complete")
                    return operator_object.return_output_object()
                else:
                    operator_object.add_workflow_metadata(
                        TranscribeError="Unable to upload metadata for asset: {asset}".format(asset=asset_id),
                        TranscribeJobId=job_id)
                    operator_object.update_workflow_status("Error")
                    raise MasExecutionError(operator_object.return_output_object())
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(TranscribeError="Unable to determine status")
            raise MasExecutionError(operator_object.return_output_object())
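# ---------------------------------------------------------------------------
# For reference, a small illustrative helper showing the shape of the
# Transcribe output handled above: results.transcripts is a list of objects
# that each carry a "transcript" string, and concatenating them produces the
# plain-text transcript that gets written to S3. The sample input below is
# made up.
# ---------------------------------------------------------------------------
def extract_text_transcript(transcription_json):
    # Concatenate every transcript fragment into one string.
    text = ""
    for fragment in transcription_json["results"]["transcripts"]:
        text += fragment["transcript"]
    return text


# extract_text_transcript({"results": {"transcripts": [{"transcript": "Hello."}]}})
# returns "Hello."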
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = str(operator_object.workflow_execution_id)
        bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
        key = operator_object.input["Media"]["Video"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            ThumbnailError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    # Adding in exception block for now since we aren't guaranteed an asset id
    # will be present, should remove later
    try:
        asset_id = operator_object.asset_id
    except KeyError as e:
        print("No asset id passed in with this workflow", e)
        asset_id = ''
    file_input = "s3://" + bucket + "/" + key
    audio_destination = "s3://" + bucket + "/" + 'private/assets/' + asset_id + "/workflows/" + workflow_id + "/"
    thumbnail_destination = "s3://" + bucket + "/" + 'private/assets/' + asset_id + "/"
    proxy_destination = "s3://" + bucket + "/" + 'private/assets/' + asset_id + "/"
    # Get the user-defined thumbnail position (in seconds), defaulting to 7
    if "ThumbnailPosition" in operator_object.configuration:
        thumbnail_position = int(operator_object.configuration["ThumbnailPosition"])
    else:
        thumbnail_position = 7
    # Get mediaconvert endpoint from cache if available
    if "MEDIACONVERT_ENDPOINT" in os.environ:
        mediaconvert_endpoint = os.environ["MEDIACONVERT_ENDPOINT"]
        customer_mediaconvert = boto3.client(
            "mediaconvert", region_name=region, endpoint_url=mediaconvert_endpoint)
    else:
        try:
            response = mediaconvert.describe_endpoints()
        except Exception as e:
            print("Exception:\n", e)
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(ThumbnailError=str(e))
            raise MasExecutionError(operator_object.return_output_object())
        else:
            mediaconvert_endpoint = response["Endpoints"][0]["Url"]
            # Cache the mediaconvert endpoint in order to avoid getting
            # throttled on the DescribeEndpoints API.
            os.environ["MEDIACONVERT_ENDPOINT"] = mediaconvert_endpoint
            customer_mediaconvert = boto3.client(
                "mediaconvert", region_name=region, endpoint_url=mediaconvert_endpoint)
    try:
        response = customer_mediaconvert.create_job(
            Role=mediaconvert_role,
            Settings={
                "OutputGroups": [{
                    "CustomName": "thumbnail",
                    "Name": "File Group",
                    "Outputs": [{
                        "ContainerSettings": {"Container": "RAW"},
                        "VideoDescription": {
                            "ScalingBehavior": "DEFAULT",
                            "TimecodeInsertion": "DISABLED",
                            "AntiAlias": "ENABLED",
                            "Sharpness": 50,
                            "CodecSettings": {
                                "Codec": "FRAME_CAPTURE",
                                "FrameCaptureSettings": {
                                    "FramerateNumerator": 1,
                                    "FramerateDenominator": thumbnail_position,
                                    "MaxCaptures": 2,
                                    "Quality": 80
                                }
                            },
                            "DropFrameTimecode": "ENABLED",
                            "ColorMetadata": "INSERT"
                        },
                        "Extension": "jpg",
                        "NameModifier": "_thumbnail"
                    }],
                    "OutputGroupSettings": {
                        "Type": "FILE_GROUP_SETTINGS",
                        "FileGroupSettings": {"Destination": thumbnail_destination}
                    }
                }, {
                    "Name": "File Group",
                    "Outputs": [{
                        "ContainerSettings": {
                            "Container": "MP4",
                            "Mp4Settings": {
                                "CslgAtom": "INCLUDE",
                                "FreeSpaceBox": "EXCLUDE",
                                "MoovPlacement": "PROGRESSIVE_DOWNLOAD"
                            }
                        },
                        "AudioDescriptions": [{
                            "AudioTypeControl": "FOLLOW_INPUT",
                            "AudioSourceName": "Audio Selector 1",
                            "CodecSettings": {
                                "Codec": "AAC",
                                "AacSettings": {
                                    "AudioDescriptionBroadcasterMix": "NORMAL",
                                    "Bitrate": 96000,
                                    "RateControlMode": "CBR",
                                    "CodecProfile": "LC",
                                    "CodingMode": "CODING_MODE_2_0",
                                    "RawFormat": "NONE",
                                    "SampleRate": 48000,
                                    "Specification": "MPEG4"
                                }
                            },
                            "LanguageCodeControl": "FOLLOW_INPUT"
                        }],
                        "Extension": "mp4",
                        "NameModifier": "_audio"
                    }],
                    "OutputGroupSettings": {
                        "Type": "FILE_GROUP_SETTINGS",
                        "FileGroupSettings": {"Destination": audio_destination}
                    }
                }, {
                    "CustomName": "proxy",
                    "Name": "File Group",
                    "Outputs": [{
                        "VideoDescription": {
                            "ScalingBehavior": "DEFAULT",
                            "TimecodeInsertion": "DISABLED",
                            "AntiAlias": "ENABLED",
                            "Sharpness": 50,
                            "CodecSettings": {
                                "Codec": "H_264",
                                "H264Settings": {
                                    "InterlaceMode": "PROGRESSIVE",
                                    "NumberReferenceFrames": 3,
                                    "Syntax": "DEFAULT",
                                    "Softness": 0,
                                    "GopClosedCadence": 1,
                                    "GopSize": 90,
                                    "Slices": 1,
                                    "GopBReference": "DISABLED",
                                    "SlowPal": "DISABLED",
                                    "SpatialAdaptiveQuantization": "ENABLED",
                                    "TemporalAdaptiveQuantization": "ENABLED",
                                    "FlickerAdaptiveQuantization": "DISABLED",
                                    "EntropyEncoding": "CABAC",
                                    "Bitrate": 5000000,
                                    "FramerateControl": "SPECIFIED",
                                    "RateControlMode": "CBR",
                                    "CodecProfile": "MAIN",
                                    "Telecine": "NONE",
                                    "MinIInterval": 0,
                                    "AdaptiveQuantization": "HIGH",
                                    "CodecLevel": "AUTO",
                                    "FieldEncoding": "PAFF",
                                    "SceneChangeDetect": "ENABLED",
                                    "QualityTuningLevel": "SINGLE_PASS",
                                    "FramerateConversionAlgorithm": "DUPLICATE_DROP",
                                    "UnregisteredSeiTimecode": "DISABLED",
                                    "GopSizeUnits": "FRAMES",
                                    "ParControl": "SPECIFIED",
                                    "NumberBFramesBetweenReferenceFrames": 2,
                                    "RepeatPps": "DISABLED",
                                    "FramerateNumerator": 30,
                                    "FramerateDenominator": 1,
                                    "ParNumerator": 1,
                                    "ParDenominator": 1
                                }
                            },
                            "AfdSignaling": "NONE",
                            "DropFrameTimecode": "ENABLED",
                            "RespondToAfd": "NONE",
                            "ColorMetadata": "INSERT"
                        },
                        "AudioDescriptions": [{
                            "AudioTypeControl": "FOLLOW_INPUT",
                            "CodecSettings": {
                                "Codec": "AAC",
                                "AacSettings": {
                                    "AudioDescriptionBroadcasterMix": "NORMAL",
                                    "RateControlMode": "CBR",
                                    "CodecProfile": "LC",
                                    "CodingMode": "CODING_MODE_2_0",
                                    "RawFormat": "NONE",
                                    "SampleRate": 48000,
                                    "Specification": "MPEG4",
                                    "Bitrate": 64000
                                }
                            },
                            "LanguageCodeControl": "FOLLOW_INPUT",
                            "AudioSourceName": "Audio Selector 1"
                        }],
                        "ContainerSettings": {
                            "Container": "MP4",
                            "Mp4Settings": {
                                "CslgAtom": "INCLUDE",
                                "FreeSpaceBox": "EXCLUDE",
                                "MoovPlacement": "PROGRESSIVE_DOWNLOAD"
                            }
                        },
                        "Extension": "mp4",
                        "NameModifier": "_proxy"
                    }],
                    "OutputGroupSettings": {
                        "Type": "FILE_GROUP_SETTINGS",
                        "FileGroupSettings": {"Destination": proxy_destination}
                    }
                }],
                "Inputs": [{
                    "AudioSelectors": {
                        "Audio Selector 1": {"Offset": 0, "DefaultSelection": "DEFAULT", "ProgramSelection": 1}
                    },
                    "VideoSelector": {"ColorSpace": "FOLLOW"},
                    "FileInput": file_input
                }]
            })
    # TODO: Add support for boto client error handling
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(ThumbnailError=str(e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        job_id = response['Job']['Id']
        operator_object.update_workflow_status("Executing")
        operator_object.add_workflow_metadata(
            MediaconvertJobId=job_id,
            MediaconvertInputFile=file_input,
            AssetId=asset_id,
            WorkflowExecutionId=workflow_id)
        return operator_object.return_output_object()
def lambda_handler(event, context):
    print("We got this event:\n", event)
    valid_types = ["mp3", "mp4", "wav", "flac"]
    optional_settings = {}
    operator_object = MediaInsightsOperationHelper(event)
    workflow_id = str(operator_object.workflow_execution_id)
    job_id = "transcribe" + "-" + workflow_id
    # Adding in exception block for now since we aren't guaranteed an asset id
    # will be present, should remove later
    try:
        asset_id = operator_object.asset_id
    except KeyError as e:
        print("No asset id passed in with this workflow", e)
        asset_id = ''
    try:
        bucket = operator_object.input["Media"]["Audio"]["S3Bucket"]
        key = operator_object.input["Media"]["Audio"]["S3Key"]
        file_type = key.split('.')[-1]
    # TODO: Do we want to add support for video?
    except KeyError:
        bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
        key = operator_object.input["Media"]["Video"]["S3Key"]
        file_type = key.split('.')[-1]
    except Exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    if file_type not in valid_types:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError="Not a valid file type")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        custom_vocab = operator_object.configuration["VocabularyName"]
        optional_settings["VocabularyName"] = custom_vocab
    except KeyError:
        # No custom vocab
        pass
    try:
        language_code = operator_object.configuration["TranscribeLanguage"]
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError="No language code defined")
        raise MasExecutionError(operator_object.return_output_object())
    media_file = 'https://s3.' + region + '.amazonaws.com/' + bucket + '/' + key
    # If mediainfo data is available then use it to avoid transcribing silent videos.
    if "Mediainfo_num_audio_tracks" in event["Input"]["MetaData"]:
        num_audio_tracks = event["Input"]["MetaData"]["Mediainfo_num_audio_tracks"]
        # Check to see if audio tracks were detected by mediainfo
        if num_audio_tracks == "0":
            # If there is no input audio then we're done.
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
    try:
        response = transcribe.start_transcription_job(
            TranscriptionJobName=job_id,
            LanguageCode=language_code,
            Media={"MediaFileUri": media_file},
            MediaFormat=file_type,
            Settings=optional_settings)
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError=str(e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["TranscriptionJob"]["TranscriptionJobStatus"] == "IN_PROGRESS":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "FAILED":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError=str(response["TranscriptionJob"]["FailureReason"]))
            raise MasExecutionError(operator_object.return_output_object())
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "COMPLETED":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError="Unhandled error for this job: {job_id}".format(job_id=job_id))
            raise MasExecutionError(operator_object.return_output_object())
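# ---------------------------------------------------------------------------
# An illustrative operator input event for the handler above. This is a
# guess at the minimal fields the handler actually touches, not the full
# control-plane event shape; bucket names, keys, and IDs are placeholders.
# ---------------------------------------------------------------------------
example_transcribe_event = {
    "Input": {
        "Media": {"Audio": {"S3Bucket": "my-mie-bucket", "S3Key": "upload/audio.mp3"}},
        "MetaData": {"Mediainfo_num_audio_tracks": "1"}
    },
    "Configuration": {"TranscribeLanguage": "en-US", "VocabularyName": "my-vocab"},
    "WorkflowExecutionId": "wf-1234",
    "AssetId": "asset-5678"
}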
def web_captions(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(CaptionsError="No valid inputs {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        lang = operator_object.configuration["SourceLanguageCode"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(CaptionsError="No language codes {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            CaptionsError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    try:
        s3_response = s3.get_object(Bucket=bucket, Key=key)
        transcribe_metadata = json.loads(s3_response["Body"].read().decode("utf-8"))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            CaptionsError="Unable to read transcription from S3: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())

    # Caption segmentation thresholds
    endTime = 0.0
    maxLength = 50      # max characters per caption
    wordCount = 0
    maxWords = 12       # max words per caption
    maxSilence = 1.5    # seconds of silence that forces a new caption
    captions = []
    caption = None
    for item in transcribe_metadata["results"]["items"]:
        isPunctuation = item["type"] == "punctuation"
        if caption is None:
            # Start of a line with punctuation, just skip it
            if isPunctuation:
                continue
            # Create a new caption line
            caption = {
                "start": float(item["start_time"]),
                "caption": "",
                "wordConfidence": []
            }
        if not isPunctuation:
            startTime = float(item["start_time"])
            # Check to see if there has been a long silence since the last
            # recorded word; if so, start a new caption, ending the last one
            # as this one starts.
            if (len(caption["caption"]) > 0) and ((endTime + maxSilence) < startTime):
                caption["end"] = startTime
                captions.append(caption)
                caption = {"start": float(startTime), "caption": "", "wordConfidence": []}
                wordCount = 0
            endTime = float(item["end_time"])
        requiresSpace = (not isPunctuation) and (len(caption["caption"]) > 0)
        if requiresSpace:
            caption["caption"] += " "
        # Append the word (or punctuation) to the caption text
        text = item["alternatives"][0]["content"]
        confidence = item["alternatives"][0]["confidence"]
        textLower = text.lower()
        caption["caption"] += text
        # Track raw word confidence
        if not isPunctuation:
            caption["wordConfidence"].append({"w": textLower, "c": float(confidence)})
            # Count words
            wordCount += 1
        # If we have reached a good amount of text, finalize the caption
        if (wordCount >= maxWords) or (len(caption["caption"]) >= maxLength):
            caption["end"] = endTime
            captions.append(caption)
            wordCount = 0
            caption = None
    # Close the last caption if required
    if caption is not None:
        caption["end"] = endTime
        captions.append(caption)

    # Store the captions in the dataplane, one page per caption
    webcaptions_name = "WebCaptions" + "_" + lang
    metadata_upload = None
    for i, caption_page in enumerate(captions, start=1):
        metadata_upload = dataplane.store_asset_metadata(
            asset_id=asset_id,
            operator_name=webcaptions_name,
            workflow_id=workflow_id,
            results=caption_page,
            paginate=True,
            end=(i == len(captions)))
        if "Status" not in metadata_upload or metadata_upload["Status"] != "Success":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                CaptionsError="Unable to store web captions {e}".format(e=metadata_upload))
            raise MasExecutionError(operator_object.return_output_object())
    operator_object.add_workflow_metadata(
        WebCaptionsS3Bucket=metadata_upload['Bucket'],
        WebCaptionsS3Key=metadata_upload['Key'])
    operator_object.update_workflow_status("Complete")
    return operator_object.return_output_object()
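# ---------------------------------------------------------------------------
# A self-contained sketch of the segmentation rules applied above, handy for
# testing the caption-breaking logic without real Transcribe output. Items
# mimic results.items: pronunciations carry start/end times, punctuation does
# not. Thresholds mirror the handler defaults; confidence tracking is left
# out to keep the sketch short.
# ---------------------------------------------------------------------------
def segment_items(items, max_words=12, max_length=50, max_silence=1.5):
    captions, caption, word_count, end_time = [], None, 0, 0.0
    for item in items:
        is_punct = item["type"] == "punctuation"
        if caption is None:
            if is_punct:
                continue  # skip leading punctuation
            caption = {"start": float(item["start_time"]), "caption": ""}
        if not is_punct:
            start_time = float(item["start_time"])
            # A long silence ends the current caption as the next word starts.
            if caption["caption"] and (end_time + max_silence) < start_time:
                caption["end"] = start_time
                captions.append(caption)
                caption = {"start": start_time, "caption": ""}
                word_count = 0
            end_time = float(item["end_time"])
            if caption["caption"]:
                caption["caption"] += " "
            word_count += 1
        caption["caption"] += item["alternatives"][0]["content"]
        # Enough words or characters also ends the caption.
        if word_count >= max_words or len(caption["caption"]) >= max_length:
            caption["end"] = end_time
            captions.append(caption)
            word_count, caption = 0, None
    if caption is not None:
        caption["end"] = end_time
        captions.append(caption)
    return captions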
def lambda_handler(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id passed along with this workflow')
        asset_id = ''
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(PollyError="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        s3_response = s3.get_object(Bucket=bucket, Key=key)
        translate_metadata = json.loads(s3_response["Body"].read().decode("utf-8"))
        translation = translate_metadata["TranslatedText"]
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyError="Unable to read translation from S3: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    # If input text is empty then we're done.
    if len(translation) < 1:
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    voices = {'en': 'Kendra', 'ru': 'Maxim', 'es': 'Lucia', 'fr': 'Mathieu'}
    # Get the language code of the translation; we should just pass this
    # along in the event later
    try:
        comprehend = boto3.client('comprehend')
        language = comprehend.detect_dominant_language(Text=translation)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyError="Unable to determine the language with comprehend: {e}".format(e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        language_code = language['Languages'][0]['LanguageCode']
        if language_code not in voices:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                PollyError="The only supported languages are: {e}".format(e=voices.keys()))
            raise MasExecutionError(operator_object.return_output_object())
        else:
            voice_id = voices[language_code]
            print("Translation received from S3:\n", translation)
            output_key = '/private/assets/' + asset_id + "/workflows/" + workflow_id + "/" + "translation"
            try:
                polly_response = polly.start_speech_synthesis_task(
                    OutputFormat='mp3',
                    OutputS3BucketName=bucket,
                    OutputS3KeyPrefix=output_key,
                    Text=translation,
                    TextType='text',
                    VoiceId=voice_id)
            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    PollyError="Unable to get response from polly: {e}".format(e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())
            else:
                polly_job_id = polly_response['SynthesisTask']['TaskId']
                operator_object.add_workflow_metadata(
                    PollyJobId=polly_job_id,
                    WorkflowExecutionId=workflow_id,
                    AssetId=asset_id)
                operator_object.update_workflow_status('Executing')
                return operator_object.return_output_object()
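# ---------------------------------------------------------------------------
# The hard-coded voices map above covers only four languages. If broader
# coverage were wanted, Polly can list the voices available for a language
# at runtime. A hedged sketch; it assumes the caller has already mapped the
# Comprehend code (e.g. "es") to a Polly language code (e.g. "es-ES"), which
# needs more care in practice.
# ---------------------------------------------------------------------------
def pick_voice(polly_client, polly_language_code):
    response = polly_client.describe_voices(LanguageCode=polly_language_code)
    voices = response.get("Voices", [])
    # Fall back to None when Polly has no voice for the language.
    return voices[0]["Id"] if voices else None


# Example: pick_voice(boto3.client("polly"), "es-ES") might return "Conchita"
# or "Lucia" depending on the region.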
def lambda_handler(event, context):
    print("We got this event:\n", event)
    valid_types = ["mp3", "mp4", "wav", "flac"]
    transcribe_job_config = {}
    optional_settings = {}
    model_settings = {}
    job_execution_settings = {}
    content_redaction_settings = {}
    identify_language = False
    language_options = []
    operator_object = MediaInsightsOperationHelper(event)
    workflow_id = str(event["WorkflowExecutionId"])
    asset_id = event['AssetId']
    job_id = "transcribe" + "-" + workflow_id
    try:
        if "ProxyEncode" in event["Input"]["Media"]:
            bucket = event["Input"]["Media"]["ProxyEncode"]["S3Bucket"]
            key = event["Input"]["Media"]["ProxyEncode"]["S3Key"]
        elif "Video" in event["Input"]["Media"]:
            bucket = event["Input"]["Media"]["Video"]["S3Bucket"]
            key = event["Input"]["Media"]["Video"]["S3Key"]
        elif "Audio" in event["Input"]["Media"]:
            bucket = event["Input"]["Media"]["Audio"]["S3Bucket"]
            key = event["Input"]["Media"]["Audio"]["S3Key"]
        file_type = key.split('.')[-1]
    except Exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    if file_type not in valid_types:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError="Not a valid file type")
        raise MasExecutionError(operator_object.return_output_object())
    # Determine the transcription language. "auto" or a missing
    # TranscribeLanguage setting both mean we ask Transcribe to identify the
    # language itself.
    language_code = None
    if "TranscribeLanguage" in operator_object.configuration:
        language_code = operator_object.configuration["TranscribeLanguage"]
        if language_code == 'auto':
            identify_language = True
    else:
        identify_language = True
    media_file = 'https://s3.' + region + '.amazonaws.com/' + bucket + '/' + key
    # Read optional transcription job settings:
    if "VocabularyName" in operator_object.configuration:
        optional_settings["VocabularyName"] = operator_object.configuration["VocabularyName"]
    if "ShowSpeakerLabels" in operator_object.configuration:
        optional_settings["ShowSpeakerLabels"] = operator_object.configuration["ShowSpeakerLabels"]
    if "MaxSpeakerLabels" in operator_object.configuration:
        optional_settings["MaxSpeakerLabels"] = operator_object.configuration["MaxSpeakerLabels"]
    if "ChannelIdentification" in operator_object.configuration:
        optional_settings["ChannelIdentification"] = operator_object.configuration["ChannelIdentification"]
    if "MaxAlternatives" in operator_object.configuration:
        optional_settings["MaxAlternatives"] = operator_object.configuration["MaxAlternatives"]
    if "VocabularyFilterName" in operator_object.configuration:
        optional_settings["VocabularyFilterName"] = operator_object.configuration["VocabularyFilterName"]
    if "VocabularyFilterMethod" in operator_object.configuration:
        optional_settings["VocabularyFilterMethod"] = operator_object.configuration["VocabularyFilterMethod"]
    if "LanguageModelName" in operator_object.configuration:
        model_settings["LanguageModelName"] = operator_object.configuration["LanguageModelName"]
    if "AllowDeferredExecution" in operator_object.configuration:
        job_execution_settings["AllowDeferredExecution"] = operator_object.configuration["AllowDeferredExecution"]
    if "DataAccessRoleArn" in operator_object.configuration:
        job_execution_settings["DataAccessRoleArn"] = operator_object.configuration["DataAccessRoleArn"]
    if "RedactionType" in operator_object.configuration:
        content_redaction_settings["RedactionType"] = operator_object.configuration["RedactionType"]
    if "RedactionOutput" in operator_object.configuration:
        content_redaction_settings["RedactionOutput"] = operator_object.configuration["RedactionOutput"]
    if "IdentifyLanguage" in operator_object.configuration:
        identify_language = operator_object.configuration["IdentifyLanguage"]
    if "LanguageOptions" in operator_object.configuration:
        language_options = operator_object.configuration["LanguageOptions"]
    # Combine all the defined transcription job settings into a single dict:
    transcribe_job_config["TranscriptionJobName"] = job_id
    transcribe_job_config["Media"] = {"MediaFileUri": media_file}
    transcribe_job_config["MediaFormat"] = file_type
    transcribe_job_config["IdentifyLanguage"] = identify_language
    # LanguageCode and IdentifyLanguage are mutually exclusive, so only set
    # LanguageCode when we are not asking Transcribe to detect the language.
    if not identify_language:
        transcribe_job_config["LanguageCode"] = language_code
    if len(optional_settings) > 0:
        transcribe_job_config["Settings"] = optional_settings
    if len(model_settings) > 0:
        transcribe_job_config["ModelSettings"] = model_settings
    if len(job_execution_settings) > 0:
        transcribe_job_config["JobExecutionSettings"] = job_execution_settings
    if len(content_redaction_settings) > 0:
        transcribe_job_config["ContentRedaction"] = content_redaction_settings
    if len(language_options) > 0:
        transcribe_job_config["LanguageOptions"] = language_options
    # If mediainfo data is available then use it to avoid transcribing silent videos.
    if "Mediainfo_num_audio_tracks" in event["Input"]["MetaData"]:
        num_audio_tracks = event["Input"]["MetaData"]["Mediainfo_num_audio_tracks"]
        # Check to see if audio tracks were detected by mediainfo
        if num_audio_tracks == "0":
            # If there is no input audio then we're done.
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
    try:
        # Run the transcribe job.
        # The ** operator converts the job config dict to keyword arguments.
        response = transcribe.start_transcription_job(**transcribe_job_config)
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError=str(e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        if response["TranscriptionJob"]["TranscriptionJobStatus"] == "IN_PROGRESS":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "FAILED":
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError=str(response["TranscriptionJob"]["FailureReason"]))
            raise MasExecutionError(operator_object.return_output_object())
        elif response["TranscriptionJob"]["TranscriptionJobStatus"] == "COMPLETED":
            operator_object.update_workflow_status("Executing")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                AssetId=asset_id,
                WorkflowExecutionId=workflow_id)
            return operator_object.return_output_object()
        else:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranscribeJobId=job_id,
                TranscribeError="Unhandled error for this job: {job_id}".format(job_id=job_id))
            raise MasExecutionError(operator_object.return_output_object())
def check_polly_webcaptions(event, context):
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    webcaptions_object = WebCaptions(operator_object)
    try:
        polly_collection = operator_object.metadata["PollyCollection"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyCollectionError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    finished_tasks = 0
    for caption in polly_collection:
        if caption["PollyStatus"] in ["completed", "failed", "not supported"]:
            finished_tasks = finished_tasks + 1
        else:
            try:
                polly_response = polly.get_speech_synthesis_task(TaskId=caption["PollyTaskId"])
            except Exception as e:
                operator_object.update_workflow_status("Error")
                operator_object.add_workflow_metadata(
                    PollyCollectionError="Unable to get response from polly: {e}".format(e=str(e)))
                raise MasExecutionError(operator_object.return_output_object())
            else:
                polly_status = polly_response["SynthesisTask"]["TaskStatus"]
                print("The status from polly is:\n", polly_status)
                if polly_status in ["inProgress", "scheduled"]:
                    operator_object.update_workflow_status("Executing")
                elif polly_status == "completed":
                    # TODO: Store job details as metadata in dataplane
                    finished_tasks = finished_tasks + 1
                    caption["PollyAudio"]["Uri"] = polly_response["SynthesisTask"]["OutputUri"]
                    operator_object.update_workflow_status("Executing")
                elif polly_status == "failed":
                    finished_tasks = finished_tasks + 1
                    operator_object.update_workflow_status("Error")
                    operator_object.add_workflow_metadata(
                        PollyCollectionError="Polly returned as failed: {e}".format(
                            e=str(polly_response["SynthesisTask"]["TaskStatusReason"])))
                    raise MasExecutionError(operator_object.return_output_object())
                else:
                    operator_object.update_workflow_status("Error")
                    operator_object.add_workflow_metadata(
                        PollyCollectionError="Polly returned an unrecognized status: {e}".format(
                            e=str(polly_response["SynthesisTask"]["TaskStatusReason"])))
                    raise MasExecutionError(operator_object.return_output_object())
    # If all the Polly jobs are done then the operator is complete
    if finished_tasks == len(polly_collection):
        operator_object.update_workflow_status("Complete")
        webcaptions_object.PutWebCaptionsCollection("CaptionsCollection", polly_collection)
        operator_object.add_workflow_metadata(PollyCollection=polly_collection)
    return operator_object.return_output_object()
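# ---------------------------------------------------------------------------
# The completion rule above, factored into a small pure helper for clarity
# and unit testing. The status strings follow the values the handler checks
# for; the helper itself is illustrative, not part of the operator.
# ---------------------------------------------------------------------------
TERMINAL_POLLY_STATUSES = {"completed", "failed", "not supported"}


def all_polly_tasks_finished(polly_collection):
    # The operator is complete only when every task has reached a terminal state.
    return all(c.get("PollyStatus") in TERMINAL_POLLY_STATUSES for c in polly_collection)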
def lambda_handler(event, context):
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        workflow_id = str(operator_object.workflow_execution_id)
        bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
        key = operator_object.input["Media"]["Video"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            MediaconvertError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    # Adding in exception block for now since we aren't guaranteed an asset id
    # will be present, should remove later
    try:
        asset_id = operator_object.asset_id
    except KeyError as e:
        print("No asset id passed in with this workflow", e)
        asset_id = ''
    file_input = "s3://" + bucket + "/" + key
    destination = "s3://" + bucket + "/" + 'private/assets/' + asset_id + "/workflows/" + workflow_id + "/"
    thumbnail_destination = "s3://" + bucket + "/" + 'private/assets/' + asset_id + "/"
    try:
        response = mediaconvert.describe_endpoints()
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediaconvertError=str(e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        mediaconvert_endpoint = response["Endpoints"][0]["Url"]
        customer_mediaconvert = boto3.client(
            "mediaconvert", region_name=region, endpoint_url=mediaconvert_endpoint)
    try:
        response = customer_mediaconvert.create_job(
            Role=mediaconvert_role,
            Settings={
                "OutputGroups": [{
                    "Name": "File Group",
                    "Outputs": [{
                        "ContainerSettings": {
                            "Container": "MP4",
                            "Mp4Settings": {
                                "CslgAtom": "INCLUDE",
                                "FreeSpaceBox": "EXCLUDE",
                                "MoovPlacement": "PROGRESSIVE_DOWNLOAD"
                            }
                        },
                        "AudioDescriptions": [{
                            "AudioTypeControl": "FOLLOW_INPUT",
                            "AudioSourceName": "Audio Selector 1",
                            "CodecSettings": {
                                "Codec": "AAC",
                                "AacSettings": {
                                    "AudioDescriptionBroadcasterMix": "NORMAL",
                                    "Bitrate": 96000,
                                    "RateControlMode": "CBR",
                                    "CodecProfile": "LC",
                                    "CodingMode": "CODING_MODE_2_0",
                                    "RawFormat": "NONE",
                                    "SampleRate": 48000,
                                    "Specification": "MPEG4"
                                }
                            },
                            "LanguageCodeControl": "FOLLOW_INPUT"
                        }],
                        "Extension": "mp4",
                        "NameModifier": "_audio"
                    }],
                    "OutputGroupSettings": {
                        "Type": "FILE_GROUP_SETTINGS",
                        "FileGroupSettings": {"Destination": destination}
                    }
                }, {
                    "CustomName": "thumbnail",
                    "Name": "File Group",
                    "Outputs": [{
                        "ContainerSettings": {"Container": "RAW"},
                        "VideoDescription": {
                            "ScalingBehavior": "DEFAULT",
                            "TimecodeInsertion": "DISABLED",
                            "AntiAlias": "ENABLED",
                            "Sharpness": 50,
                            "CodecSettings": {
                                "Codec": "FRAME_CAPTURE",
                                "FrameCaptureSettings": {
                                    "FramerateNumerator": 1,
                                    "FramerateDenominator": 5,
                                    "MaxCaptures": 2,
                                    "Quality": 80
                                }
                            },
                            "DropFrameTimecode": "ENABLED",
                            "ColorMetadata": "INSERT"
                        },
                        "NameModifier": "_thumbnail"
                    }],
                    "OutputGroupSettings": {
                        "Type": "FILE_GROUP_SETTINGS",
                        "FileGroupSettings": {"Destination": thumbnail_destination}
                    }
                }],
                "AdAvailOffset": 0,
                "Inputs": [{
                    "AudioSelectors": {
                        "Audio Selector 1": {"Offset": 0, "DefaultSelection": "DEFAULT", "ProgramSelection": 1}
                    },
                    "VideoSelector": {"ColorSpace": "FOLLOW"},
                    "FilterEnable": "AUTO",
                    "PsiControl": "USE_PSI",
                    "FilterStrength": 0,
                    "DeblockFilter": "DISABLED",
                    "DenoiseFilter": "DISABLED",
                    "TimecodeSource": "EMBEDDED",
                    "FileInput": file_input
                }]
            })
    # TODO: Add support for boto client error handling
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(MediaconvertError=str(e))
        raise MasExecutionError(operator_object.return_output_object())
    else:
        job_id = response['Job']['Id']
        operator_object.update_workflow_status("Executing")
        operator_object.add_workflow_metadata(
            MediaconvertJobId=job_id,
            MediaconvertInputFile=key,
            AssetId=asset_id,
            WorkflowExecutionId=workflow_id)
        return operator_object.return_output_object()
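# ---------------------------------------------------------------------------
# Unlike the thumbnail operator, this handler calls DescribeEndpoints on
# every invocation, which is easy to throttle. A hedged sketch of the same
# environment-variable caching pattern the thumbnail operator uses; it
# assumes the module-level mediaconvert client, region, and an os import are
# available as in the surrounding code.
# ---------------------------------------------------------------------------
def get_mediaconvert_client():
    endpoint = os.environ.get("MEDIACONVERT_ENDPOINT")
    if not endpoint:
        # One DescribeEndpoints call per warm Lambda container.
        endpoint = mediaconvert.describe_endpoints()["Endpoints"][0]["Url"]
        os.environ["MEDIACONVERT_ENDPOINT"] = endpoint
    return boto3.client("mediaconvert", region_name=region, endpoint_url=endpoint)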
def lambda_handler(event, context):
    operator_object = MediaInsightsOperationHelper(event)
    # Get operator parameters
    try:
        workflow_id = str(event["WorkflowExecutionId"])
        asset_id = event['AssetId']
        if "Video" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Video"]["S3Bucket"]
            key = operator_object.input["Media"]["Video"]["S3Key"]
        elif "Audio" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Audio"]["S3Bucket"]
            key = operator_object.input["Media"]["Audio"]["S3Key"]
        elif "Image" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Image"]["S3Bucket"]
            key = operator_object.input["Media"]["Image"]["S3Key"]
        elif "Text" in operator_object.input["Media"]:
            bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
            key = operator_object.input["Media"]["Text"]["S3Key"]
        file_type = key.split('.')[-1]
    except Exception:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(GenericDataLookupError="No valid inputs")
        raise MasExecutionError(operator_object.return_output_object())
    # Get the metadata filename
    print("Looking up metadata for s3://" + bucket + "/" + key)
    # Get user-defined location for generic data file
    if "Key" in operator_object.configuration:
        metadata_filename = operator_object.configuration["Key"]
    else:
        operator_object.add_workflow_metadata(GenericDataLookupError="Missing S3 key for data file.")
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    if "Bucket" in operator_object.configuration:
        metadata_bucket = operator_object.configuration["Bucket"]
    else:
        operator_object.add_workflow_metadata(GenericDataLookupError="Missing S3 bucket for data file.")
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    # Get metadata
    s3 = boto3.client('s3')
    try:
        print("Getting data from s3://" + metadata_bucket + "/" + metadata_filename)
        data = s3.get_object(Bucket=metadata_bucket, Key=metadata_filename)
        metadata_json = json.loads(data['Body'].read().decode('utf-8'))
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(GenericDataLookupError="Unable to read data file. " + str(e))
        raise MasExecutionError(operator_object.return_output_object())
    # Verify that the metadata is a dict, as required by the dataplane
    if not isinstance(metadata_json, dict):
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Metadata must be of type dict. Found " + str(type(metadata_json)) + " instead.")
        raise MasExecutionError(operator_object.return_output_object())
    # Save metadata to dataplane
    operator_object.add_workflow_metadata(AssetId=asset_id, WorkflowExecutionId=workflow_id)
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(
        asset_id, operator_object.name, workflow_id, metadata_json)
    # Validate that the metadata was saved to the dataplane
    if "Status" not in metadata_upload:
        operator_object.add_workflow_metadata(
            GenericDataLookupError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    else:
        # Update the workflow status
        if metadata_upload["Status"] == "Success":
            print("Uploaded metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        else:
            operator_object.add_workflow_metadata(
                GenericDataLookupError="Unable to upload metadata for asset: {asset}".format(asset=asset_id))
            operator_object.update_workflow_status("Error")
            raise MasExecutionError(operator_object.return_output_object())