Example #1
def lambda_handler(event, context):
    session = boto3.session.Session()
    region = session.region_name

    # Default to unsuccessful
    isSuccessful = "FALSE"

    # Create a random name for the transcription job
    jobname = id_generator()

    # Extract the bucket and key from the downloadPodcast lambda function
    bucket = event['audioS3Location']['bucket']
    key = event['audioS3Location']['key']

    content_type = event['audio_type']
    if content_type not in CONTENT_TYPE_TO_MEDIA_FORMAT:
        raise InvalidInputError(content_type + " is not a supported audio type.")
    media_type = CONTENT_TYPE_TO_MEDIA_FORMAT[content_type]
    logger.info("media type: " + content_type)

    # Assemble the url for the object for transcribe. It must be an s3 url in the region
    url = "https://s3-" + region + ".amazonaws.com/" + bucket + "/" + key

    try:
        settings = {
            'VocabularyName': event['vocabularyInfo']['name'],
            'ShowSpeakerLabels': False
        }

        # Enable speaker identification only when more than one speaker is expected
        if int(event['speakers']) > 1:
            settings['ShowSpeakerLabels'] = True
            settings['MaxSpeakerLabels'] = int(event['speakers'])

        # Call the AWS SDK to initiate the transcription job.
        response = client.start_transcription_job(TranscriptionJobName=jobname,
                                                  LanguageCode='en-US',
                                                  Settings=settings,
                                                  MediaFormat=media_type,
                                                  Media={'MediaFileUri': url})
        isSuccessful = "TRUE"
    except (client.exceptions.BadRequestException,
            client.exceptions.LimitExceededException,
            client.exceptions.ClientError) as e:
        # There is a limit to how many transcribe jobs can run concurrently. If you hit this limit,
        # raise a ThrottlingException so the step function will retry.
        logger.error(str(e))
        raise ThrottlingException(e)
    return {"success": isSuccessful, "transcribeJob": jobname}
def lambda_handler(event, context):
    print("Received event" + json.dumps(event, indent=4))

    session = boto3.session.Session()
    region = session.region_name

    # Default to unsuccessful
    isSuccessful = "FALSE"

    # Create a random name for the transcription job
    jobname = id_generator()

    # Extract the bucket and key from the lambda function
    bucket = event['bucket']
    key = event['key']
    content_type = event['audio_type']
    if content_type not in CONTENT_TYPE_TO_MEDIA_FORMAT:
        raise InvalidInputError(content_type + " is not a supported audio type.")

    media_type = CONTENT_TYPE_TO_MEDIA_FORMAT[content_type]
    logger.info("media type: " + content_type)

    # Assemble the url for the object for transcribe. It must be an s3 url in the region
    url = "https://s3-" + region + ".amazonaws.com/" + bucket + "/" + key

    try:
        settings = {'ChannelIdentification': True}

        print('url: ' + url)

        # Call the AWS SDK to initiate the transcription job.
        response = client.start_transcription_job(TranscriptionJobName=jobname,
                                                  LanguageCode=LANGUAGE_CODE,
                                                  Settings=settings,
                                                  MediaFormat=media_type,
                                                  Media={'MediaFileUri': url})
        isSuccessful = "TRUE"
    except (client.exceptions.BadRequestException,
            client.exceptions.LimitExceededException,
            client.exceptions.ClientError) as e:
        # There is a limit to how many transcribe jobs can run concurrently. If you hit this limit,
        # raise a ThrottlingException so the step function will retry.
        logger.error(str(e))
        raise ThrottlingException(e)
    return {"success": isSuccessful, "transcribeJob": jobname}
Example #3
def lambda_handler(event, context):
    url = event['podcastUrl']
    bucket = event['bucket']
    content_type = event['audio_type']

    # generate a temp file name to store in S3
    key = 'podcasts/audio/' + id_generator() + "-" + os.path.basename(url)

    try:
        logger.info("downloading from: " + url)

        # Open the url
        stream = urlopen(url)

        s3_object_metadata = {'href': url}

        logger.info("writing to s3://" + bucket + "/" + key)
        s3_client.upload_fileobj(
            Fileobj=stream,
            Bucket=bucket,
            Key=key,
            ExtraArgs={
                "Metadata": s3_object_metadata,
                'ContentType': content_type
            }
        )
        logger.info("done writing to s3://" + bucket + "/" + key)

        # Return the bucket and key (the location of the podcast file stored in S3)
        return {
            "bucket": bucket,
            "key": key
        }

    # handle errors
    except HTTPError as e:
        logger.error("HTTPError downloading:" + url)
        logger.exception(str(e))
        raise e
    except URLError as e:
        logger.error("URLError downloading:" + url)
        logger.exception(str(e))
        raise e
    except Exception as e:
        logger.error("Unexpected error:")
        logger.exception(str(e))
        raise e
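For reference, a hypothetical local invocation of the download handler; the URL, bucket name, and content type below are placeholders rather than values from the original project.

if __name__ == '__main__':
    sample_event = {
        'podcastUrl': 'https://example.com/episodes/episode-001.mp3',  # placeholder episode URL
        'bucket': 'my-podcast-audio-bucket',                           # placeholder S3 bucket
        'audio_type': 'audio/mpeg'
    }
    # Returns {"bucket": ..., "key": "podcasts/audio/<random>-episode-001.mp3"} on success
    print(lambda_handler(sample_event, None))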
def lambda_handler(event, context):
    """
    The first lambda function that runs, triggered by a DynamoDB Transcripts table event
    Starts the state machine and gives it the key for audio file stored in S3 for audio transcription
    Does not return any value for another lambda function
    """

    for record in event.get('Records'):
        if record.get('eventName') in ('INSERT', 'MODIFY'):

            # Retrieve the item attributes from the stream record
            Id = record['dynamodb']['NewImage']['id']['S']
            Procedure = record['dynamodb']['NewImage']['procedure']['S']
            BucketName = record['dynamodb']['NewImage']['fileData']['M'][
                'bucketName']['S']
            BucketKey = record['dynamodb']['NewImage']['fileData']['M'][
                'bucketKey']['S']
            Jurisdiction = record['dynamodb']['NewImage']['jurisdiction']['S']
            Description = record['dynamodb']['NewImage']['description']['S']
            FileType = record['dynamodb']['NewImage']['fileType']['S']
            FileName = record['dynamodb']['NewImage']['fileName']['S']

            request_params = {
                "dynamoId": Id,
                "bucketName": BucketName,
                "bucketKey": BucketKey,
                "jurisdiction": Jurisdiction,
                "description": Description,
                "procedure": Procedure,
                "fileType": FileType,
                "fileName": FileName
            }

            response = STEPFUNCTIONS_CLIENT.start_execution(
                stateMachineArn=STEPFUNCTIONS_ARN,
                name=id_generator(),
                input=json.dumps(request_params,
                                 indent=4,
                                 sort_keys=True,
                                 default=str))
        else:
            print("Should only expect insert/modify DynamoDB operations")
def process_transcript(transcription_url, agent_name='', agent_arn=''):
    custom_vocabs = None

    response = urlopen(transcription_url)
    output = response.read()
    json_data = json.loads(output)
    logger.info(json_data)

    # customer
    customer_transcriptions = []
    # Build sentences: words that follow within less than 1 second stay in the
    # same sentence; a gap of 1 second or more starts a new sentence
    for d in json_data['results']['channel_labels']['channels'][0]['items']:
        if 'start_time' not in d:
            pass
        elif customer_transcriptions == [] or float(d['start_time']) - float(
                customer_transcriptions[-1]['end_time']) >= 1:
            customer_transcriptions.append({
                'job_name': json_data['jobName'],
                'person': 'customer',
                'start_time': d['start_time'],
                'end_time': d['end_time'],
                'content': d['alternatives'][0]['content'],
                'detail_flag': True
            })
        elif float(d['start_time']) - float(
                customer_transcriptions[-1]['end_time']) < 1:  # less than 1 second apart
            customer_transcriptions[-1]['end_time'] = d['end_time']
            customer_transcriptions[-1]['content'] += d['alternatives'][0][
                'content']

    for customer_transcription in customer_transcriptions:
        customer_transcription['start_time'] = int(
            float(customer_transcription['start_time']) * 1000)
        customer_transcription['end_time'] = int(
            float(customer_transcription['end_time']) * 1000)
    for i, customer_transcription in enumerate(customer_transcriptions):
        customer_result = detect_all(customer_transcription['content'])
        # res = comprehend.detect_sentiment(Text=customer_transcription['content'],LanguageCode=LANGUAGE_CODE)
        for key in customer_result.keys():
            customer_transcriptions[i][key] = customer_result[key]

    # agent
    agent_transcriptions = []
    # Build sentences: words that follow within less than 1 second stay in the
    # same sentence; a gap of 1 second or more starts a new sentence
    for d in json_data['results']['channel_labels']['channels'][1]['items']:
        if 'start_time' not in d:
            pass
        elif agent_transcriptions == [] or float(d['start_time']) - float(
                agent_transcriptions[-1]['end_time']) >= 1:
            agent_transcriptions.append({
                'job_name': json_data['jobName'],
                'person': 'agent',
                'start_time': d['start_time'],
                'end_time': d['end_time'],
                'content': d['alternatives'][0]['content'],
                'agent_arn': agent_arn,
                'agent_name': agent_name,
                'detail_flag': True
            })
        elif float(d['start_time']) - float(
                agent_transcriptions[-1]['end_time']) < 1:  # less than 1 second apart
            agent_transcriptions[-1]['end_time'] = d['end_time']
            agent_transcriptions[-1]['content'] += d['alternatives'][0][
                'content']
    for agent_transcription in agent_transcriptions:
        agent_transcription['start_time'] = int(
            float(agent_transcription['start_time']) * 1000)
        agent_transcription['end_time'] = int(
            float(agent_transcription['end_time']) * 1000)
    for i, agent_transcription in enumerate(agent_transcriptions):
        agent_result = detect_all(agent_transcription['content'])
        # res = comprehend.detect_sentiment(Text=agent_transcription['content'],LanguageCode=LANGUAGE_CODE)
        for key in agent_result.keys():
            agent_transcriptions[i][key] = agent_result[key]

    # Transcription of the whole call
    ## agent
    agent_content = ''
    for item in json_data['results']['channel_labels']['channels'][1]['items']:
        agent_content += item['alternatives'][0]['content']
    agent_content = agent_content.replace(' ', '')
    ## customer
    customer_content = ''
    for item in json_data['results']['channel_labels']['channels'][0]['items']:
        customer_content += item['alternatives'][0]['content']
    customer_content = customer_content.replace(' ', '')
    ## whole
    whole_transcription = {
        'whole_transcript':
        json_data['results']['transcripts'][0]['transcript'].replace(' ', ''),
        'agent_transcript': agent_content,
        'customer_transcript': customer_content,
        'job_name': json_data['jobName'],
        'agent_arn': agent_arn,
        'agent_name': agent_name,
        'detail_flag': False,
    }
    whole_detect_result = detect_all(whole_transcription['whole_transcript'])
    for key in whole_detect_result.keys():
        whole_transcription['whole_' + key] = whole_detect_result[key]

    agent_detect_result = detect_all(whole_transcription['agent_transcript'])
    for key in agent_detect_result.keys():
        whole_transcription['agent_' + key] = agent_detect_result[key]

    customer_detect_result = detect_all(
        whole_transcription['customer_transcript'])
    for key in customer_detect_result.keys():
        whole_transcription['customer_' + key] = customer_detect_result[key]

    # s3upload
    transcript_locations = []

    # customer
    for customer_transcription in customer_transcriptions:
        key = 'callrecords/transcript/sentence/customer/' + id_generator(
        ) + '.json'
        response = s3_client.put_object(Body=json.dumps(customer_transcription,
                                                        indent=2),
                                        Bucket=BUCKET,
                                        Key=key)
        logger.info(json.dumps(response, indent=2))
        logger.info("successfully written transcript to s3://" + BUCKET + "/" +
                    key)

        # Return the bucket and key of the transcription / comprehend result.
        transcript_locations.append({"bucket": BUCKET, "key": key})
    # agent
    for agent_transcription in agent_transcriptions:
        key = 'callrecords/transcript/sentence/agent/' + id_generator(
        ) + '.json'
        response = s3_client.put_object(Body=json.dumps(agent_transcription,
                                                        indent=2),
                                        Bucket=BUCKET,
                                        Key=key)
        logger.info(json.dumps(response, indent=2))
        logger.info("successfully written transcript to s3://" + BUCKET + "/" +
                    key)

        # Return the bucket and key of the transcription / comprehend result.
        transcript_locations.append({"bucket": BUCKET, "key": key})
    # Save the JSON for the whole call
    key = 'callrecords/transcript/whole/json/' + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(whole_transcription,
                                                    indent=2),
                                    Bucket=BUCKET,
                                    Key=key)
    logger.info(json.dumps(response, indent=2))
    logger.info("successfully written transcript to s3://" + BUCKET + "/" +
                key)
    transcript_locations.append({"bucket": BUCKET, "key": key})

    logger.info('return value:')
    logger.info(transcript_locations)
    return transcript_locations
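detect_all() is not included in this excerpt. Judging by how its result is merged key-by-key into each record, and by the commented-out comprehend.detect_sentiment call above, it likely wraps Amazon Comprehend and returns a flat dict. A sketch under that assumption (comprehend and LANGUAGE_CODE are module-level names assumed to exist, as in the rest of this file):

def detect_all(text):
    # Run Comprehend sentiment analysis and return a flat dict of fields to merge
    result = {}
    if not text:
        return result
    sentiment = comprehend.detect_sentiment(Text=text[0:5000],
                                            LanguageCode=LANGUAGE_CODE)
    result['sentiment'] = sentiment['Sentiment']
    result['sentiment_score'] = sentiment['SentimentScore']
    return result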
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    # Pull the bucket name from the environment variable set in the cloudformation stack
    bucket = os.environ['BUCKET_NAME']
    retval = []
    paragraphs = []

    # Pull the signed URL for the payload of the transcription job
    transcriptionUrl = event['transcribeStatus']['transcriptionUrl']

    response = s3_client.get_object(
        Bucket=event["vocabularyInfo"]['mapping']['bucket'],
        Key=event["vocabularyInfo"]['mapping']['key'])
    file_content = response['Body'].read().decode('utf-8')

    mapping = json.loads(file_content)
    print("Received mapping: " + json.dumps(mapping, indent=2))

    # Open the transcription job payload.
    f = urlopen(transcriptionUrl)
    j = json.loads(f.read())

    # Here is the JSON returned by the Amazon Transcription SDK
    # {
    #  "jobName":"JobName",
    #  "accountId":"Your AWS Account Id",
    #  "results":{
    #    "transcripts":[
    #        {
    #            "transcript":"ah ... this is the text of the transcript"
    #        }
    #    ],
    #    "items":[
    #        {
    #            "start_time":"0.630",
    #            "end_time":"5.620",
    #            "alternatives": [
    #                {
    #                    "confidence":"0.7417",
    #                    "content":"ah"
    #                }
    #            ],
    #            "type":"pronunciation"
    #        }
    #     ]
    #  }
    # }

    # Pull the items from the transcription. Each word will be its own item with a start and end time
    items = j["results"]["items"]

    # We would like to determine the key phrases in the transcript so we can search on common phrases
    # rather than a single word at a time. In order to maintain the relationship between the time
    # the text is spoken and the ability to search on it, we need to pass each phrase individually
    # along with its timestamp so we retain that relationship. We will use Comprehend to extract the
    # key phrases from the text.

    contents = ""
    timedata = []

    prevEndTime = -1
    paragraphGap = 1.5
    prevStartTime = -1
    newParagraph = False
    prevSpeaker = 'spk_0'

    hasSpeakerLabels = False
    speakerMapping = []

    # Create a mapping of the transitions from one speaker to another
    if 'speaker_labels' in j['results']:
        hasSpeakerLabels = True
        for i in range(len(j['results']['speaker_labels']['segments'])):
            speakerLabel = j['results']['speaker_labels']['segments'][i]
            speakerMapping.append({
                "speakerLabel": speakerLabel['speaker_label'],
                "endTime": float(speakerLabel['end_time'])
            })

    speakerIndex = 0

    # Repeat the loop for each item (word and punctuation)
    # The transcription will be broken out into a number of sections that are referred to
    # below as paragraphs. The paragraph is the unit of text that is stored in the
    # elasticsearch index. It is broken out by punctuation, speaker changes, a long pause
    # in the audio, or overall length
    for i in range(len(items)):
        reason = ""

        # If the transcription detected the end of a sentence, we'll start a new paragraph
        if items[i]['type'] == 'punctuation':
            if items[i]["alternatives"][0]["content"] == '.':
                newParagraph = True

            # Always assume the first guess is right.
            contents += items[i]["alternatives"][0]["content"]

        # Add the start time to the string -> timedata
        if 'start_time' in items[i]:
            speakerLabel = 'spk_0'

            if prevStartTime == -1:
                prevStartTime = float(items[i]["start_time"])

            # gap refers to the amount of time between spoken words
            gap = float(items[i]["start_time"]) - prevEndTime

            if hasSpeakerLabels:
                while speakerIndex < (len(speakerMapping) -
                                      1) and speakerMapping[
                                          speakerIndex + 1]['endTime'] < float(
                                              items[i]["start_time"]):
                    speakerIndex += 1

                speakerLabel = speakerMapping[speakerIndex]['speakerLabel']

            # Change paragraphs if the speaker changes
            if speakerLabel != prevSpeaker:
                newParagraph = True
                reason = "Speaker Change from " + prevSpeaker + " to " + speakerLabel
            # the gap exceeds a preset threshold
            elif gap > paragraphGap:
                newParagraph = True
                reason = "Time gap"
            # The text is over 4900 characters (the limit for Comprehend is 5000)
            elif len(contents) > 4900:
                newParagraph = True
                reason = "Long paragraph"
            else:
                newParagraph = False

            if prevEndTime != -1 and newParagraph:

                # append the block of text to the array. Call comprehend to get
                # the keyword tags for this block of text
                retval.append({
                    "startTime": prevStartTime,
                    "endTime": prevEndTime,
                    "text": contents,
                    "gap": gap,
                    "tags": run_comprehend(contents),
                    "reason": reason,
                    "speaker": prevSpeaker,
                    "len": len(contents)
                })
                # Reset the contents and the time mapping
                # print('paragraph:' + contents)
                contents = ""
                timedata = []
                prevEndTime = -1
                prevStartTime = -1
                newParagraph = False
            else:
                prevEndTime = float(items[i]["end_time"])

            prevSpeaker = speakerLabel

            # If the contents is not empty, add a space before appending the next word
            if contents != "":
                contents += " "

            # Always assume the first guess is right.
            word = items[i]["alternatives"][0]["content"]

            # Map the custom words back to their original text
            for key in mapping:
                val = mapping[key]
                word = word.replace(key, val)

            contents += word

    # Run Comprehend on the remaining text
    # run_comprehend(contents, timedata, retval)

    retval.append({
        "startTime": prevStartTime,
        "endTime": prevEndTime,
        "text": contents,
        "tags": run_comprehend(contents),
        "speaker": prevSpeaker
    })

    # Create a payload for the output of the transcribe and comprehend API calls. There's a limit on the
    # amount of data stored in a step function payload, so we will use S3 to store the payload instead.
    # This can get to be pretty big.
    key = 'podcasts/keywords/' + id_generator() + '.json'
    # store retval to s3
    response = s3_client.put_object(Body=json.dumps(retval, indent=2),
                                    Bucket=bucket,
                                    Key=key)

    print("Return Value: " + json.dumps(retval, indent=2))

    # Return the bucket and key of the transcription / comprehend result.
    return {"bucket": bucket, "key": key}
Example #7
def lambda_handler(event, context):
    logger.info("Received event: " + json.dumps(event, indent=2))
    feed_url = event['rss']
    max_episodes_to_process = None
    if 'maxEpisodesToProcess' in event:
        max_episodes_to_process = int(event['maxEpisodesToProcess'])

    maxConcurrentEpisodes = 10

    # Open the url and process the RSS feed
    retval = []
    bucket = os.environ['BUCKET_NAME']

    episode_count = 0

    # This array holds the entity types that are included in the custom vocabulary
    vocabularyTypes = [
        'COMMERCIAL_ITEM', 'EVENT', 'LOCATION', 'ORGANIZATION', 'TITLE'
    ]
    vocabularyItems = []

    try:
        filename = '/tmp/' + id_generator() + '.rss'
        # HTTP GET the RSS feed XML file
        f = urlopen(feed_url)

        # Open our local file for writing
        with open(filename, "wb") as local_file:
            local_file.write(f.read())

        # The RSS feed is an XML file, so parse it and traverse the tree and pull all the /channel/items
        tree = ET.parse(filename)
        root = tree.getroot()

        # Extract the title of the podcast
        channelTitle = root.find('channel/title')

        for child in root.findall('channel/item'):
            title = child.find('title')
            envelope = child.find('enclosure')

            date_entry = child.find('pubDate').text
            dt = parser.parse(date_entry)
            date_string = dt.strftime("%Y:%m:%d %H:%M:%S")

            keywords = []

            description = child.find('description').text
            description = description[0:4900]

            comprehendResponse = client.detect_entities(Text=description,
                                                        LanguageCode='en')

            # we estimate the number of speakers in the podcast by parsing people names from the episode summary
            speaker_list = []
            for i in range(len(comprehendResponse["Entities"])):
                entity = comprehendResponse["Entities"][i]

                # For every person mentioned in the description, increment the number of
                # speakers. This is making the assumption that the episode text will
                # mention all the speakers and not include mentions to people that
                # are not in the podcast.
                # It isn't critical that this number is correct, it is simply used to break
                # up the body of the podcast into smaller chunks. If the speaker detection
                # is inaccurate, it doesn't have a major impact on the functionality of
                # the system.
                if entity['Type'] == 'PERSON':
                    if not entity['Text'].startswith('@'):
                        speaker_list.append(entity['Text'])
                    else:
                        logger.info(f'skipping person {entity["Text"]}')
                # add to vocabulary if not already in there
                if entity['Type'] in vocabularyTypes and not entity[
                        'Text'] in vocabularyItems:
                    cleanText = entity['Text'].replace('@', '')
                    cleanText = cleanText.replace('.', '')
                    if cleanText:
                        vocabularyItems.append(cleanText)

            duplicates = find_duplicate_person(speaker_list)
            for d in duplicates:
                speaker_list.remove(d)
            num_speakers = len(speaker_list)

            # If there is an envelope, the link will point to an audio file
            if envelope is not None:
                episode_url = envelope.attrib['url']
                file_type = envelope.attrib["type"]
                episode_count += 1

                episode = {
                    'Episode': title.text,
                    'PodcastName': channelTitle.text,
                    'podcastUrl': episode_url,
                    'audioType': file_type,
                    'tags': keywords,
                    'speakers': num_speakers,
                    'speakerNames': speaker_list,
                    'status': 'PENDING',
                    'publishedTime': date_string,
                    'summary': description,
                    'sourceFeed': feed_url
                }

                logger.debug(json.dumps(episode, indent=2))

                if "dryrun" in event:
                    episode["dryrun"] = event["dryrun"]
                # Add this item to the collection
                retval.append(episode)

            if max_episodes_to_process is not None and episode_count >= max_episodes_to_process:
                break

    # handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)
    except URLError as e:
        print("URL Error:", e.reason, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)

    logger.info(json.dumps(retval, indent=2))

    # This connection can be pretty big and exceed the capacity of the Step Function state data, so we store it
    # in S3 instead and return a link to the S3 file.
    s3_client = boto3.client('s3')
    key = 'podcasts/episodelist/' + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(
        {
            "maxConcurrentEpisodes": maxConcurrentEpisodes,
            "episodes": retval
        },
        indent=2),
                                    Bucket=bucket,
                                    Key=key)

    event['episodes'] = {
        "status": 'RUNNING',
        "remainingEpisodes": episode_count,
        "bucket": bucket,
        "key": key
    }
    event['customVocabulary'] = vocabularyItems

    # Return the link to the episode JSON document and the custom vocabulary items.
    return event
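find_duplicate_person() is also defined outside this excerpt. Based on how its result is used (removing entries from speaker_list), it most likely flags names that are substrings of a longer name, e.g. "Jane" when "Jane Doe" is also present. A plausible sketch:

def find_duplicate_person(speaker_list):
    # Collect names that are contained in another, longer name in the same list
    duplicates = []
    for name in speaker_list:
        for other in speaker_list:
            if name != other and name in other and name not in duplicates:
                duplicates.append(name)
    return duplicates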
Example #8
                break

    # handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)
    except URLError as e:
        print("URL Error:", e.reason, feed_url)
        raise InvalidInputError("Unable to download RSS feed: " + feed_url)

    logger.info(json.dumps(retval, indent=2))

    # This connection can be pretty big and exceed the capacity of the Step Function state data, so we store it
    # in S3 instead and return a link to the S3 file.
    s3_client = boto3.client('s3')
    key = 'podcasts/episodelist/' + id_generator() + '.json'
    response = s3_client.put_object(Body=json.dumps(
        {
            "maxConcurrentEpisodes": maxConcurrentEpisodes,
            "episodes": retval
        },
        indent=2),
                                    Bucket=bucket,
                                    Key=key)

    event['episodes'] = {
        "status": 'RUNNING',
        "remainingEpisodes": episode_count,
        "bucket": bucket,
        "key": key
    }
Example #9
def process_transcript(transcription_url, podcast_url, vocabulary_info):
    custom_vocabs = None
    if "mapping" in vocabulary_info:
        try:
            vocab_mapping_bucket = vocabulary_info['mapping']['bucket']
            key = vocabulary_info['mapping']['key']
            obj = s3_client.get_object(Bucket=vocab_mapping_bucket, Key=key)
            custom_vocabs = json.loads(obj['Body'].read())
            logger.info("key:" + key)
            logger.info("using custom vocab mapping: \n" +
                        json.dumps(custom_vocabs, indent=2))
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                raise InvalidInputError(
                    "The S3 file for custom vocab list does not exist.")
            else:
                raise

    # job_status_response = transcribe_client.get_transcription_job(TranscriptionJobName=transcribe_job_id)
    response = urlopen(transcription_url)
    output = response.read()
    json_data = json.loads(output)

    logger.debug(json.dumps(json_data, indent=4))
    results = json_data['results']
    # free up memory
    del json_data

    comprehend_chunks, paragraphs = chunk_up_transcript(custom_vocabs, results)

    start = time.time()
    detected_entities_response = comprehend.batch_detect_entities(
        TextList=comprehend_chunks, LanguageCode='en')
    round_trip = time.time() - start
    logger.info('End of batch_detect_entities. Took time {:10.4f}\n'.format(
        round_trip))

    entities = parse_detected_entities_response(detected_entities_response, {})
    entities_as_list = {}
    for entity_type in entities:
        entities_as_list[entity_type] = list(entities[entity_type])

    clean_up_entity_results(entities_as_list)
    print(json.dumps(entities_as_list, indent=4))

    # start = time.time()
    # detected_phrase_response = comprehend.batch_detect_key_phrases(TextList=comprehend_chunks, LanguageCode='en')
    # round_trip = time.time() - start
    # logger.info('End of batch_detect_key_phrases. Took time {:10.4f}\n'.format(round_trip))

    # key_phrases = parse_detected_key_phrases_response(detected_phrase_response)
    # logger.debug(json.dumps(key_phrases, indent=4))

    doc_to_update = {'transcript': paragraphs}
    doc_to_update['transcript_entities'] = entities_as_list
    logger.info(json.dumps(doc_to_update, indent=4))
    # doc_to_update['key_phrases'] = key_phrases
    key = 'podcasts/transcript/' + id_generator() + '.json'

    response = s3_client.put_object(Body=json.dumps(doc_to_update, indent=2),
                                    Bucket=bucket,
                                    Key=key)
    logger.info(json.dumps(response, indent=2))

    logger.info("successfully written transcript to s3://" + bucket + "/" +
                key)
    # Return the bucket and key of the transcription / comprehend result.
    transcript_location = {"bucket": bucket, "key": key}
    return transcript_location
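parse_detected_entities_response() and clean_up_entity_results() are helpers defined elsewhere. From how their results are consumed (values converted with list(), then stored as JSON), they appear to group entity text by entity type and prune noisy entries. Sketches under those assumptions:

def parse_detected_entities_response(detected_entities_response, entities):
    # Group unique entity strings by entity type across all batch results
    for result in detected_entities_response.get('ResultList', []):
        for entity in result['Entities']:
            entities.setdefault(entity['Type'], set()).add(entity['Text'])
    return entities


def clean_up_entity_results(entities_as_list):
    # Drop entity types that add little search value; QUANTITY is an assumed example
    entities_as_list.pop('QUANTITY', None)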
def process_transcript(transcription_url):
    custom_vocabs = None

    response = urlopen(transcription_url)
    output = response.read()
    json_data = json.loads(output)

    logger.debug(json.dumps(json_data, indent=4))
    results = json_data['results']
    # free up memory
    del json_data

    comprehend_chunks, paragraphs = chunk_up_transcript(custom_vocabs, results)

    key_phrases = ''
    entities_as_list = {}

    if comprehend_chunks is not None and len(comprehend_chunks) > 0:
        start = time.time()
        detected_entities_response = comprehend.batch_detect_entities(
            TextList=comprehend_chunks, LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_entities. Took time {:10.4f}\n'.format(
                round_trip))

        entities = parse_detected_entities_response(detected_entities_response,
                                                    {})

        for entity_type in entities:
            entities_as_list[entity_type] = list(entities[entity_type])

        clean_up_entity_results(entities_as_list)
        print(json.dumps(entities_as_list, indent=4))

        start = time.time()
        detected_phrase_response = comprehend.batch_detect_key_phrases(
            TextList=comprehend_chunks, LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_key_phrases. Took time {:10.4f}\n'.format(
                round_trip))

        key_phrases = parse_detected_key_phrases_response(
            detected_phrase_response)
        logger.debug(json.dumps(key_phrases, indent=4))

    agentTranscript = ''

    # Agent is channel 1 now...
    for item in results['channel_labels']['channels'][1]['items']:
        if item['type'] == 'punctuation':
            filler = ''
        else:
            filler = ' '
        agentTranscript += filler + item['alternatives'][0]['content']

    customerTranscript = ''

    # Customer is channel 0 now...
    for item in results['channel_labels']['channels'][0]['items']:
        if item['type'] == 'punctuation':
            filler = ''
        else:
            filler = ' '
        customerTranscript += filler + item['alternatives'][0]['content']

    agent = [agentTranscript]
    customer = [customerTranscript]
    agent_entities_as_list = {}
    detected_agent_phrase_response = ''
    agent_key_phrases = ''
    agent_sentiment = ''

    if agent[0] != '':
        start = time.time()
        detected_agent_entities_response = comprehend.batch_detect_entities(
            TextList=agent[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_entities. Took time {:10.4f}\n'.format(
                round_trip))

        agent_entities = parse_detected_entities_response(
            detected_agent_entities_response, {})

        for entity_type in agent_entities:
            agent_entities_as_list[entity_type] = list(
                agent_entities[entity_type])

        clean_up_entity_results(agent_entities_as_list)
        print(json.dumps(agent_entities_as_list, indent=4))

        start = time.time()
        detected_agent_phrase_response = comprehend.batch_detect_key_phrases(
            TextList=agent[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_key_phrases. Took time {:10.4f}\n'.format(
                round_trip))

        agent_key_phrases = parse_detected_key_phrases_response(
            detected_agent_phrase_response)
        logger.debug(json.dumps(agent_key_phrases, indent=4))

        agent_sentiment = comprehend.detect_sentiment(
            Text=agentTranscript[0:5000],
            LanguageCode=LANGUAGE_CODE)['Sentiment']

        print('agent sentiment ' + agent_sentiment)

    customer_entities = {}
    customer_entities_as_list = {}
    customer_key_phrases = ''
    customer_sentiment = ''

    if customer[0] != '':
        logger.info("CUSTOMER " + json.dumps(customer))
        logger.info("CUSTOMER[0:24] " + json.dumps(customer[0:24]))
        start = time.time()
        detected_customer_entities_response = comprehend.batch_detect_entities(
            TextList=customer[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_entities. Took time {:10.4f}\n'.format(
                round_trip))

        customer_entities = parse_detected_entities_response(
            detected_customer_entities_response, {})

        for entity_type in customer_entities:
            customer_entities_as_list[entity_type] = list(
                customer_entities[entity_type])

        clean_up_entity_results(customer_entities_as_list)
        print(json.dumps(customer_entities_as_list, indent=4))

        start = time.time()
        detected_customer_phrase_response = comprehend.batch_detect_key_phrases(
            TextList=customer[0:24], LanguageCode=LANGUAGE_CODE)
        round_trip = time.time() - start
        logger.info(
            'End of batch_detect_key_phrases. Took time {:10.4f}\n'.format(
                round_trip))

        customer_key_phrases = parse_detected_key_phrases_response(
            detected_customer_phrase_response)
        logger.debug(json.dumps(customer_key_phrases, indent=4))

        customer_sentiment = comprehend.detect_sentiment(
            Text=customerTranscript[0:5000],
            LanguageCode=LANGUAGE_CODE)['Sentiment']

        print('customer sentiment ' + customer_sentiment)

    doc_to_update = {'transcript': paragraphs}
    doc_to_update['agent'] = agentTranscript
    doc_to_update['customer'] = customerTranscript
    doc_to_update['transcript_entities'] = entities_as_list
    doc_to_update['key_phrases'] = key_phrases
    doc_to_update['agent_key_phrases'] = agent_key_phrases
    doc_to_update['agent_entities'] = agent_entities_as_list
    doc_to_update['customer_phrases'] = customer_key_phrases
    doc_to_update['customer_entities'] = customer_entities_as_list
    doc_to_update['agent_sentiment'] = agent_sentiment
    doc_to_update['customer_sentiment'] = customer_sentiment
    key = 'callrecords/transcript/' + id_generator() + '.json'

    response = s3_client.put_object(Body=json.dumps(doc_to_update, indent=2),
                                    Bucket=bucket,
                                    Key=key)
    logger.info(json.dumps(response, indent=2))

    logger.info("successfully written transcript to s3://" + bucket + "/" +
                key)
    # Return the bucket and key of the transcription / comprehend result.
    transcript_location = {"bucket": bucket, "key": key}
    return transcript_location
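parse_detected_key_phrases_response() is the remaining helper that is not part of this excerpt. Since its result is logged as JSON and stored directly in the document, a list of unique phrase strings is a reasonable guess; a sketch:

def parse_detected_key_phrases_response(detected_phrase_response):
    # Collect the unique key-phrase strings from a batch_detect_key_phrases response
    phrases = set()
    for result in detected_phrase_response.get('ResultList', []):
        for phrase in result['KeyPhrases']:
            phrases.add(phrase['Text'])
    return sorted(phrases)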