def lambda_handler(event, context):
    """Translate a Transcribe transcript with Amazon Translate.

    Reads the transcript JSON from S3, splits it into sentences with an
    NLTK punkt tokenizer matched to the source language, translates it in
    chunks of fewer than 1000 characters to avoid Translate throttling,
    and stores the combined result in the MIE dataplane.

    Returns the operator output object; raises MasExecutionError on any
    failure after marking the workflow as errored.
    """
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # Required input: S3 location of the transcript JSON.
    try:
        bucket = operator_object.input["Media"]["Text"]["S3Bucket"]
        key = operator_object.input["Media"]["Text"]["S3Key"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="No valid inputs {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    try:
        source_lang = operator_object.configuration["SourceLanguageCode"]
        target_lang = operator_object.configuration["TargetLanguageCode"]
    except KeyError:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Language codes are not defined")
        raise MasExecutionError(operator_object.return_output_object())
    try:
        s3_response = s3.get_object(Bucket=bucket, Key=key)
        transcribe_metadata = json.loads(
            s3_response["Body"].read().decode("utf-8"))
        transcript = transcribe_metadata["results"]["transcripts"][0][
            "transcript"]
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranslateError="Unable to read transcription from S3: {e}".format(
                e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    # If input text is empty then we're done.
    if len(transcript) < 1:
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    # Tell the NLTK data loader to look for files in /tmp/ — the only
    # path AWS Lambda allows us to write to.
    nltk.data.path.append("/tmp/")
    nltk.download('punkt', download_dir='/tmp/')
    # Map source-language codes to punkt sentence tokenizers; anything
    # unknown falls back to English.
    # BUGFIX: Russian was previously keyed as 're', which is not a
    # language code, so Russian ('ru') input silently used the English
    # tokenizer.
    punkt_models = {
        'fr': ('French', 'tokenizers/punkt/french.pickle'),
        'de': ('German', 'tokenizers/punkt/german.pickle'),
        'ru': ('Russian', 'tokenizers/punkt/russian.pickle'),
        'it': ('Italian', 'tokenizers/punkt/italian.pickle'),
        'pt': ('Portuguese', 'tokenizers/punkt/portuguese.pickle'),
        'es': ('Spanish', 'tokenizers/punkt/spanish.pickle'),
    }
    language_name, model_path = punkt_models.get(
        source_lang, ('English', 'tokenizers/punkt/english.pickle'))
    print("Using " + language_name +
          " dictionary to find sentence boundaries.")
    tokenizer = nltk.data.load(model_path)
    # Split input text into a list of sentences.
    sentences = tokenizer.tokenize(transcript)
    print("Input text length: " + str(len(transcript)))
    print("Number of sentences: " + str(len(sentences)))
    translated_text = ''
    transcript_chunk = ''

    def _translate_chunk(text):
        # Translate one chunk, converting any client failure into a
        # workflow error. Factored out because the loop body and the
        # final-chunk path were duplicates.
        try:
            print("Translation input text length: " + str(len(text)))
            chunk = translate_client.translate_text(
                Text=text,
                SourceLanguageCode=source_lang,
                TargetLanguageCode=target_lang)
            # BUGFIX: the original printed len() of the whole response
            # dict (its key count); report the translated text length.
            print("Translation output text length: " +
                  str(len(chunk["TranslatedText"])))
            return chunk
        except Exception as e:
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(
                TranslateError="Unable to get response from translate: {e}"
                .format(e=str(e)))
            raise MasExecutionError(operator_object.return_output_object())

    for sentence in sentences:
        # Translate can handle 5000 unicode characters, but even at 3000
        # we've seen RateExceeded throttling; 1000 proved reliable.
        if (len(sentence) + len(transcript_chunk) < 1000):
            transcript_chunk = transcript_chunk + ' ' + sentence
        else:
            translation_chunk = _translate_chunk(transcript_chunk)
            translated_text = translated_text + ' ' + translation_chunk[
                "TranslatedText"]
            transcript_chunk = sentence
    print("Translating the final chunk of input text...")
    translation_chunk = _translate_chunk(transcript_chunk)
    translated_text = translated_text + ' ' + translation_chunk[
        "TranslatedText"]
    # Put final result into a JSON object because the MIE dataplane
    # requires it to be so.
    translation_result = {
        "TranslatedText": translated_text,
        "SourceLanguageCode": source_lang,
        "TargetLanguageCode": target_lang,
    }
    print("Final translation text length: " + str(len(translated_text)))
    dataplane = DataPlane()
    metadata_upload = dataplane.store_asset_metadata(asset_id,
                                                     operator_object.name,
                                                     workflow_id,
                                                     translation_result)
    # Missing "Status" and a non-success status are both upload failures.
    if "Status" not in metadata_upload or metadata_upload['Status'] != 'Success':
        operator_object.add_workflow_metadata(
            TranslateError="Unable to upload metadata for asset: {asset}".
            format(asset=asset_id))
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    operator_object.add_media_object('Text', metadata_upload['Bucket'],
                                     metadata_upload['Key'])
    operator_object.update_workflow_status("Complete")
    return operator_object.return_output_object()
def lambda_handler(event, context):
    """Poll a Transcribe job and, on completion, persist the transcript.

    If the job is still running, re-queues the operator as Executing.
    On COMPLETED it downloads the transcript JSON, writes a text-only
    copy to S3, and stores the full JSON in the MIE dataplane.

    Raises MasExecutionError on any failure after marking the workflow
    as errored.
    """
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # If Transcribe wasn't run due to silent audio, then we're done.
    if "Mediainfo_num_audio_tracks" in event["Input"]["MetaData"] and event[
            "Input"]["MetaData"]["Mediainfo_num_audio_tracks"] == "0":
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    try:
        job_id = operator_object.metadata["TranscribeJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranscribeError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        response = transcribe.get_transcription_job(
            TranscriptionJobName=job_id)
        print(response)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranscribeError=str(e),
                                              TranscribeJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    job_status = response["TranscriptionJob"]["TranscriptionJobStatus"]
    if job_status == "IN_PROGRESS":
        operator_object.update_workflow_status("Executing")
        operator_object.add_workflow_metadata(
            TranscribeJobId=job_id,
            AssetId=asset_id,
            WorkflowExecutionId=workflow_id)
        return operator_object.return_output_object()
    if job_status == "FAILED":
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            TranscribeJobId=job_id,
            TranscribeError=str(
                response["TranscriptionJob"]["FailureReason"]))
        raise MasExecutionError(operator_object.return_output_object())
    if job_status == "COMPLETED":
        transcribe_uri = response["TranscriptionJob"]["Transcript"][
            "TranscriptFileUri"]
        http = urllib3.PoolManager()
        transcription = http.request('GET', transcribe_uri)
        transcription_data = transcription.data.decode("utf-8")
        transcription_json = json.loads(transcription_data)
        # Concatenate the plain-text transcripts.
        # BUGFIX: the original used `accumulator.join(transcript)`, which
        # inserts the accumulated text between every CHARACTER of the
        # next transcript instead of appending it.
        text_only_transcript = ''
        for transcripts in transcription_json["results"]["transcripts"]:
            text_only_transcript += transcripts["transcript"]
        print(text_only_transcript)
        dataplane = DataPlane()
        s3 = boto3.client('s3')
        transcript_storage_path = dataplane.generate_media_storage_path(
            asset_id, workflow_id)
        key = transcript_storage_path['S3Key'] + "transcript.txt"
        bucket = transcript_storage_path['S3Bucket']
        s3.put_object(Bucket=bucket, Key=key, Body=text_only_transcript)
        transcription_json["TextTranscriptUri"] = {
            "S3Bucket": bucket,
            "S3Key": key
        }
        metadata_upload = dataplane.store_asset_metadata(
            asset_id, operator_object.name, workflow_id, transcription_json)
        # Missing "Status" and non-success status are both upload failures.
        if metadata_upload.get('Status') == 'Success':
            operator_object.add_media_object('Text',
                                             metadata_upload['Bucket'],
                                             metadata_upload['Key'])
            operator_object.add_workflow_metadata(TranscribeJobId=job_id)
            operator_object.update_workflow_status("Complete")
            return operator_object.return_output_object()
        operator_object.add_workflow_metadata(
            TranscribeError=
            "Unable to upload metadata for asset: {asset}".format(
                asset=asset_id),
            TranscribeJobId=job_id)
        operator_object.update_workflow_status("Error")
        raise MasExecutionError(operator_object.return_output_object())
    # Any other status value is unexpected — fail the workflow.
    operator_object.update_workflow_status("Error")
    operator_object.add_workflow_metadata(
        TranscribeError="Unable to determine status")
    raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context):
    """Poll the MediaConvert job that produces thumbnail, audio, and
    proxy-encode outputs, registering each as a media object when done.

    Output-group layout (fixed by the job template that submitted the
    job): group 0 = thumbnail, group 1 = audio, group 2 = mp4 proxy.

    Raises MasExecutionError on any failure after marking the workflow
    as errored.
    """
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    # Get MediaConvert job id
    try:
        job_id = operator_object.metadata["MediaconvertJobId"]
        workflow_id = operator_object.workflow_execution_id
        input_file = operator_object.metadata["MediaconvertInputFile"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            MediaconvertError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    # Get asset id
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print("No asset_id in this workflow")
        asset_id = ''
    # Get mediaconvert endpoint from cache if available; cache it
    # otherwise to avoid getting throttled on the DescribeEndpoints API.
    if "MEDIACONVERT_ENDPOINT" in os.environ:
        mediaconvert_endpoint = os.environ["MEDIACONVERT_ENDPOINT"]
    else:
        try:
            response = mediaconvert.describe_endpoints()
        except Exception as e:
            print("Exception:\n", e)
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(MediaconvertError=str(e))
            raise MasExecutionError(operator_object.return_output_object())
        mediaconvert_endpoint = response["Endpoints"][0]["Url"]
        os.environ["MEDIACONVERT_ENDPOINT"] = mediaconvert_endpoint
    customer_mediaconvert = boto3.client("mediaconvert",
                                         region_name=region,
                                         endpoint_url=mediaconvert_endpoint)
    # Get MediaConvert job results
    try:
        response = customer_mediaconvert.get_job(Id=job_id)
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        # BUGFIX: stringify the exception — every other handler uses
        # str(e); the raw exception object is not JSON-serializable
        # workflow metadata.
        operator_object.add_workflow_metadata(MediaconvertError=str(e),
                                              MediaconvertJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    job_status = response["Job"]["Status"]
    if job_status in ('IN_PROGRESS', 'PROGRESSING'):
        operator_object.update_workflow_status("Executing")
        operator_object.add_workflow_metadata(
            MediaconvertJobId=job_id,
            MediaconvertInputFile=input_file,
            AssetId=asset_id,
            WorkflowExecutionId=workflow_id)
        return operator_object.return_output_object()
    if job_status == 'COMPLETE':
        input_filename = os.path.splitext(
            operator_object.metadata["MediaconvertInputFile"].split("/")
            [-1])[0]

        def output_location(group_index):
            # Derive (bucket, key) of an output group's first output
            # file from its destination URI (s3://bucket/folder/...),
            # the input file name, and the output's name modifier.
            group = response["Job"]["Settings"]["OutputGroups"][group_index]
            destination = group["OutputGroupSettings"]["FileGroupSettings"][
                "Destination"]
            extension = group["Outputs"][0]["Extension"]
            modifier = group["Outputs"][0]["NameModifier"]
            out_bucket = destination.split("/")[2]
            out_folder = "/".join(destination.split("/")[3:-1])
            out_key = (out_folder + "/" + input_filename + modifier + "." +
                       extension)
            return out_bucket, out_key

        # Get Thumbnail object
        thumbnail_bucket, thumbnail_key = output_location(0)
        operator_object.add_media_object("Thumbnail", thumbnail_bucket,
                                         thumbnail_key)
        # Get audio object
        audio_bucket, audio_key = output_location(1)
        operator_object.add_media_object("Audio", audio_bucket, audio_key)
        operator_object.add_workflow_metadata(MediaconvertJobId=job_id)
        # Get mp4 proxy encode object
        proxy_encode_bucket, proxy_encode_key = output_location(2)
        operator_object.add_media_object("ProxyEncode", proxy_encode_bucket,
                                         proxy_encode_key)
        # Set workflow status complete
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    # Any other status is unexpected — fail the workflow.
    operator_object.update_workflow_status("Error")
    operator_object.add_workflow_metadata(
        MediaconvertError="Unhandled exception, unable to get status from mediaconvert: {response}".format(response=response),
        MediaconvertJobId=job_id)
    raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context):
    """Poll the MediaConvert audio-extraction job and register its
    output as an "Audio" media object when complete.

    Raises MasExecutionError on any failure after marking the workflow
    as errored.
    """
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        job_id = operator_object.metadata["MediaconvertJobId"]
        workflow_id = operator_object.workflow_execution_id
        input_file = operator_object.metadata["MediaconvertInputFile"]
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            MediaconvertError="Missing a required metadata key {e}".format(
                e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print("No asset_id in this workflow")
        asset_id = ''
    # Resolve the account MediaConvert endpoint, caching it in the Lambda
    # environment — consistent with the other MediaConvert handler — so
    # warm invocations don't get throttled on DescribeEndpoints.
    if "MEDIACONVERT_ENDPOINT" in os.environ:
        mediaconvert_endpoint = os.environ["MEDIACONVERT_ENDPOINT"]
    else:
        try:
            response = mediaconvert.describe_endpoints()
        except Exception as e:
            print("Exception:\n", e)
            operator_object.update_workflow_status("Error")
            operator_object.add_workflow_metadata(MediaconvertError=str(e))
            raise MasExecutionError(operator_object.return_output_object())
        mediaconvert_endpoint = response["Endpoints"][0]["Url"]
        os.environ["MEDIACONVERT_ENDPOINT"] = mediaconvert_endpoint
    customer_mediaconvert = boto3.client("mediaconvert",
                                         region_name=region,
                                         endpoint_url=mediaconvert_endpoint)
    try:
        response = customer_mediaconvert.get_job(Id=job_id)
    except Exception as e:
        print("Exception:\n", e)
        operator_object.update_workflow_status("Error")
        # BUGFIX: stringify the exception — the raw exception object is
        # not JSON-serializable workflow metadata, and every other
        # handler uses str(e).
        operator_object.add_workflow_metadata(MediaconvertError=str(e),
                                              MediaconvertJobId=job_id)
        raise MasExecutionError(operator_object.return_output_object())
    job_status = response["Job"]["Status"]
    if job_status in ('IN_PROGRESS', 'PROGRESSING'):
        operator_object.update_workflow_status("Executing")
        operator_object.add_workflow_metadata(
            MediaconvertJobId=job_id,
            MediaconvertInputFile=input_file,
            AssetId=asset_id,
            WorkflowExecutionId=workflow_id)
        return operator_object.return_output_object()
    if job_status == 'COMPLETE':
        # TODO: Store job details as metadata in dataplane
        # TODO: Get output uri from dataplane
        group = response["Job"]["Settings"]["OutputGroups"][0]
        output_uri = group["OutputGroupSettings"]["FileGroupSettings"][
            "Destination"]
        extension = group["Outputs"][0]["Extension"]
        modifier = group["Outputs"][0]["NameModifier"]
        # Destination looks like s3://bucket/folder/...
        bucket = output_uri.split("/")[2]
        folder = "/".join(output_uri.split("/")[3:-1])
        file_name = os.path.splitext(
            operator_object.metadata["MediaconvertInputFile"])[0].split(
                "/")[-1]
        key = folder + "/" + file_name + modifier + "." + extension
        operator_object.add_media_object("Audio", bucket, key)
        operator_object.add_workflow_metadata(MediaconvertJobId=job_id)
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    # Any other status is unexpected — fail the workflow.
    operator_object.update_workflow_status("Error")
    operator_object.add_workflow_metadata(
        MediaconvertError=
        "Unhandled exception, unable to get status from mediaconvert: {response}"
        .format(response=response),
        MediaconvertJobId=job_id)
    raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context):
    """Poll an Amazon Polly speech-synthesis task.

    Re-queues the operator while the task is inProgress or scheduled;
    on completion registers the synthesized audio as a media object;
    on failure (or any unrecognized status) fails the workflow.
    """
    print("We got this event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)
    try:
        task_id = operator_object.metadata["PollyJobId"]
        workflow_id = operator_object.workflow_execution_id
        asset_id = operator_object.asset_id
    except KeyError as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyError="Missing a required metadata key {e}".format(e=e))
        raise MasExecutionError(operator_object.return_output_object())
    try:
        polly_response = polly.get_speech_synthesis_task(TaskId=task_id)
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            PollyError="Unable to get response from polly: {e}".format(
                e=str(e)))
        raise MasExecutionError(operator_object.return_output_object())
    synthesis_task = polly_response["SynthesisTask"]
    polly_status = synthesis_task["TaskStatus"]
    print("The status from polly is:\n", polly_status)
    if polly_status == "inProgress":
        operator_object.add_workflow_metadata(
            PollyJobId=synthesis_task["TaskId"],
            AssetId=asset_id,
            WorkflowExecutionId=workflow_id)
        operator_object.update_workflow_status("Executing")
        return operator_object.return_output_object()
    if polly_status == "scheduled":
        operator_object.add_workflow_metadata(
            PollyJobId=task_id,
            AssetId=asset_id,
            WorkflowExecutionId=workflow_id)
        operator_object.update_workflow_status("Executing")
        return operator_object.return_output_object()
    if polly_status == "completed":
        # TODO: Store job details as metadata in dataplane
        # OutputUri looks like https://<host>/<bucket>/<folder>/<file>,
        # so index 3 is the bucket and 4/5 form the key.
        uri_parts = synthesis_task["OutputUri"].split("/")
        bucket = uri_parts[3]
        key = uri_parts[4] + "/" + uri_parts[5]
        operator_object.add_workflow_metadata(PollyJobId=task_id)
        operator_object.add_media_object("Audio", bucket, key)
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    # "failed" and any unrecognized status both fall through to the same
    # failure handling (as in the original; TaskStatusReason is assumed
    # present in both cases).
    operator_object.update_workflow_status("Error")
    operator_object.add_workflow_metadata(
        PollyError="Polly returned as failed: {e}".format(e=str(
            synthesis_task["TaskStatusReason"])))
    raise MasExecutionError(operator_object.return_output_object())
def lambda_handler(event, context):
    """Translate an S3-hosted transcript in a single Amazon Translate
    call and store the raw Translate response in the MIE dataplane.

    Raises MasExecutionError on any failure after marking the workflow
    as errored.
    """
    print("We got the following event:\n", event)
    operator_object = MediaInsightsOperationHelper(event)

    def fail(message):
        # Flag the workflow as failed, record the reason, and abort.
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(TranslateError=message)
        raise MasExecutionError(operator_object.return_output_object())

    try:
        text_input = operator_object.input["Media"]["Text"]
        bucket = text_input["S3Bucket"]
        key = text_input["S3Key"]
    except KeyError as e:
        fail("No valid inputs {e}".format(e=e))
    try:
        workflow_id = operator_object.workflow_execution_id
    except KeyError as e:
        fail("Missing a required metadata key {e}".format(e=e))
    try:
        asset_id = operator_object.asset_id
    except KeyError:
        print('No asset id for this workflow')
        asset_id = ''
    try:
        source_lang = operator_object.configuration["SourceLanguageCode"]
        target_lang = operator_object.configuration["TargetLanguageCode"]
    except KeyError:
        fail("Language codes are not defined")
    try:
        body = s3.get_object(Bucket=bucket, Key=key)["Body"]
        transcribe_metadata = json.loads(body.read().decode("utf-8"))
        transcript = transcribe_metadata["results"]["transcripts"][0][
            "transcript"]
    except Exception as e:
        fail("Unable to read transcription from S3: {e}".format(e=str(e)))
    try:
        translation = translate_client.translate_text(
            Text=transcript,
            SourceLanguageCode=source_lang,
            TargetLanguageCode=target_lang)
    except Exception as e:
        fail("Unable to get response from translate: {e}".format(e=str(e)))
    metadata_upload = DataPlane().store_asset_metadata(
        asset_id, operator_object.name, workflow_id, translation)
    if metadata_upload.get('Status') == 'Success':
        operator_object.add_media_object('Text', metadata_upload['Bucket'],
                                         metadata_upload['Key'])
        operator_object.update_workflow_status("Complete")
        return operator_object.return_output_object()
    # Missing "Status" and non-success status are treated identically.
    operator_object.add_workflow_metadata(
        TranslateError="Unable to upload metadata for asset: {asset}".format(
            asset=asset_id))
    operator_object.update_workflow_status("Error")
    raise MasExecutionError(operator_object.return_output_object())