def get_transcript_long(content: bytes = None, audio_path: str = None):
    """
    Transcribes a long audio file through the long-running recognize operation.

    Args:
        content (bytes): Audio data as bytes. Used whenever it is not None.
        audio_path (str): URI of the audio; used only when content is None.

    Returns:
        object: The response produced once the operation has finished.

    Raises:
        ValueError: If both content and audio_path are None.
    """
    if content is not None:
        audio = speech.RecognitionAudio(content=content)
    elif audio_path is not None:
        audio = speech.RecognitionAudio(uri=audio_path)
    else:
        raise ValueError('At least one parameter cannot be None.')

    flac_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    pending = client.long_running_recognize(config=flac_config, audio=audio)
    print("Waiting for operation to complete...")
    response = pending.result(timeout=90)

    # One result per consecutive chunk of audio; the first alternative of
    # each chunk is the most likely transcription.
    for chunk in response.results:
        best = chunk.alternatives[0]
        print(f"Transcript: {best.transcript}")
        print(f"Confidence: {best.confidence}")

    return response
def get_transcript(content: bytes = None, audio_path: str = None):
    """
    Transcribes an audio file with a single synchronous recognize call.

    Args:
        content (bytes): Audio data as bytes. Used whenever it is not None.
        audio_path (str): URI of the audio; used only when content is None.

    Returns:
        object: The response returned by the recognize call.

    Raises:
        ValueError: If both content and audio_path are None.
    """
    if content is None and audio_path is None:
        raise ValueError("At least one parameter cannot be None.")

    # Inline bytes win over the URI when both are supplied.
    source = {"uri": audio_path} if content is None else {"content": content}
    audio = speech.RecognitionAudio(**source)

    linear16_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    response = client.recognize(config=linear16_config, audio=audio)
    for segment in response.results:
        top_transcript = segment.alternatives[0].transcript
        print(f"Transcript: {top_transcript}")

    return response
Example #3
0
def transcribe_gcs(gcs_uri):
    """Transcribe the audio at gcs_uri with a long-running operation.

    The request is configured for FLAC audio at 44100 Hz with two channels
    and the "ja-JP" language code. The transcript and confidence of the first
    alternative of every result are printed; nothing is returned.

    Args:
        gcs_uri: URI of the audio file, passed as the recognition audio uri.
    """
    from google.cloud import speech

    speech_client = speech.SpeechClient()

    remote_audio = speech.RecognitionAudio(uri=gcs_uri)
    stereo_flac_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code="ja-JP",
    )

    pending = speech_client.long_running_recognize(
        config=stereo_flac_config, audio=remote_audio
    )

    print("Waiting for operation to complete...")
    response = pending.result(timeout=300)

    # Results arrive in audio order, one per consecutive portion; the first
    # alternative of each is the most likely one.
    for portion in response.results:
        best = portion.alternatives[0]
        print(f"Transcript: {best.transcript}")
        print(f"Confidence: {best.confidence}")
def transcribe_file(speech_file):
    """Transcribe a local audio file synchronously and print the result.

    The file is sent as FLAC, 44100 Hz, two channels, "en-GB". For each
    result, the transcript of its first alternative is printed; nothing is
    returned.

    Args:
        speech_file: Path of the audio file, read in binary mode.
    """
    from google.cloud import speech
    import io

    speech_client = speech.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, "rb") as fh:
        raw_audio = fh.read()

    recognition_audio = speech.RecognitionAudio(content=raw_audio)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code="en-GB",
        audio_channel_count=2,
    )
    # [END speech_python_migration_config]

    # [START speech_python_migration_sync_response]
    response = speech_client.recognize(
        config=recognition_config, audio=recognition_audio
    )

    # [END speech_python_migration_sync_request]
    # Results cover consecutive portions of the audio; walking all of them
    # yields the transcript of the whole file.
    for portion in response.results:
        # Alternative 0 is the most likely transcription of this portion.
        best = portion.alternatives[0]
        print(f"Transcript: {best.transcript}")
Example #5
0
def speech_to_text(gcs_uri):
    """Transcribe the audio at gcs_uri and return the combined transcript.

    Runs one synchronous recognize request (LINEAR16, 16000 Hz, "en-US"),
    prints each result's top transcript followed by the concatenation of all
    of them, and returns that concatenation.

    Args:
        gcs_uri: URI of the audio, for example
            "gs://cloud-samples-data/speech/brooklyn_bridge.raw".

    Returns:
        The first-alternative transcripts of all results, joined with no
        separator.
    """
    speech_client = speech.SpeechClient()

    remote_audio = speech.RecognitionAudio(uri=gcs_uri)

    linear16_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    response = speech_client.recognize(config=linear16_config, audio=remote_audio)

    pieces = []
    for portion in response.results:
        pieces.append(portion.alternatives[0].transcript)
        print("Transcript: {}".format(portion.alternatives[0].transcript))

    combined = "".join(pieces)
    print(combined)
    return combined
Example #6
0
def speechToText(extention):
    """Transcribe the "exemple" audio file stored in the resources directory.

    The file read is resources/exemple.EXT beside this module, where EXT is
    the given extension. It is sent as LINEAR16, 16000 Hz, "en-US".

    Args:
        extention: File extension, without the dot, appended to "exemple.".

    Returns:
        The top transcript of the last result in the response, or an empty
        string when the response contains no results.
    """
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = os.path.join(
        os.path.dirname(__file__), "resources", "exemple." + extention
    )

    # Loads the audio into memory
    with io.open(file_name, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)

    # Start from "" so an empty result list yields a value instead of
    # raising UnboundLocalError at the return statement.
    resultat = ""
    # NOTE(review): each iteration overwrites the previous value, so only the
    # last result is returned - confirm that dropping earlier portions of the
    # audio is intended.
    for result in response.results:
        resultat = "{}".format(result.alternatives[0].transcript)

    return resultat
def generate_transcript(language_code="ro-RO"):
    """Transcribe cache/recording.wav, located beside this module.

    The channel count is read from the WAV file itself and forwarded in the
    recognition config together with the LINEAR16 encoding.

    Args:
        language_code: Language code placed in the recognition config.

    Returns:
        The response object returned by the synchronous recognize call.
    """
    # Creates google client
    client = speech.SpeechClient()

    # Full path of the audio file, Replace with your file name
    file_name = os.path.join(os.path.dirname(__file__), "cache/recording.wav")

    # Open the WAV reader as a context manager so its file handle is closed
    # as soon as the channel count has been read.
    with wave.open(file_name, "rb") as wav_file:
        ch = wav_file.getnchannels()

    # Loads the audio file into memory
    with io.open(file_name, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=ch,
        language_code=language_code,
    )

    # Sends the request to google to transcribe the audio
    response = client.recognize(request={"config": config, "audio": audio})

    return response
Example #8
0
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously.

    The file is sent as LINEAR16, 16000 Hz, "en-US" through one long-running
    recognize operation. The transcript and confidence of the first
    alternative of every result are printed; nothing is returned.

    Args:
        speech_file: Path of the audio file, read in binary mode.
    """
    print("Before Imports")
    from google.cloud import speech

    client = speech.SpeechClient()

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # [START speech_python_migration_async_response]
    # Submit the request exactly once: a second submission would start a
    # second, redundant operation whose result is never read.
    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)
    # [END speech_python_migration_async_response]

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print(u"Transcript: {}".format(best.transcript))
        print("Confidence: {}".format(best.confidence))
Example #9
0
def transcribe_file(speech_file):
    """Transcribe an already-open audio file and return the first transcript.

    The audio is sent with an unspecified encoding, 16000 Hz, "uk-UA". The
    top transcript of every result is logged at INFO level.

    Args:
        speech_file: Object with a read() method; whatever read() returns is
            sent as the audio content.

    Returns:
        The top transcript of the first result, or None when the response is
        empty or carries no results.
    """
    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(content=speech_file.read())

    config = speech.RecognitionConfig(
        # for .mp3 format
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=16000,
        language_code="uk-UA",
    )

    response = client.recognize(config=config, audio=audio)
    # Check the result list itself, not just the response: indexing
    # results[0] on an empty list would raise IndexError.
    if not response or not response.results:
        return None

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        logger.info(u"Transcript: {}".format(
            result.alternatives[0].transcript))
    return response.results[0].alternatives[0].transcript
    def run(self, file_name='default'):
        """Transcribe a file from the payload directory beside this module.

        The audio is sent as MULAW, 8000 Hz, mono, "ko-KR" with automatic
        punctuation enabled. Each result's top transcript is printed.

        Args:
            file_name: Name of the file inside the "payload" directory. The
                text before its first "_" is returned as the first element.

        Returns:
            Tuple (ip, text): the file-name prefix described above and the
            top transcripts of all results joined with no separator.
        """
        speech_client = speech.SpeechClient()
        ip = file_name.split('_')[0]
        payload_path = os.path.join(
            os.path.dirname(__file__), "payload", file_name
        )

        # Loads the audio into memory
        with io.open(payload_path, "rb") as payload:
            raw_audio = payload.read()
            audio = speech.RecognitionAudio(content=raw_audio)

        mulaw_config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MULAW,
            sample_rate_hertz=8000,
            audio_channel_count=1,
            language_code="ko-KR",
            enable_automatic_punctuation=True,
        )

        # Detects speech in the audio file
        response = speech_client.recognize(config=mulaw_config, audio=audio)

        pieces = []
        for portion in response.results:
            best = portion.alternatives[0]
            print("-" * 20)
            print("Transcript: {}".format(best.transcript))
            pieces.append(best.transcript)

        return ip, "".join(pieces)
Example #11
0
def transcribe_file(speech_file):
    """Transcribe a local two-channel FLAC file and return its transcripts.

    The request is synchronous, "en-US", with no sample rate given. Each
    result's top transcript is printed as it is collected.

    Args:
        speech_file: Path of the audio file, read in binary mode.

    Returns:
        List with the top transcript of every result, in response order.
    """
    from google.cloud import speech
    import io

    speech_client = speech.SpeechClient()

    with io.open(speech_file, "rb") as fh:
        raw_audio = fh.read()

    recognition_audio = speech.RecognitionAudio(content=raw_audio)
    stereo_flac_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        language_code="en-US",
        audio_channel_count=2,
    )

    response = speech_client.recognize(
        config=stereo_flac_config, audio=recognition_audio
    )

    # Results cover consecutive portions of the audio; alternative 0 of each
    # is the most likely transcription of that portion.
    collected = []
    for portion in response.results:
        print(f"Transcript: {portion.alternatives[0].transcript}")
        collected.append(portion.alternatives[0].transcript)
    return collected


# transcribe_file("recording/509648114_data/1-ColorfulPockets_0617.ogg")
Example #12
0
def transcribe_file_with_multichannel(speech_file):
    """Transcribe a local two-channel audio file, channel by channel.

    The request is synchronous and uses LINEAR16 at 44100 Hz, "en-US", two
    channels with separate recognition per channel enabled. For every result
    the top transcript and the channel tag are printed; nothing is returned.

    Args:
        speech_file: Path of the audio file, opened in binary mode.
    """
    # [START speech_transcribe_multichannel]
    from google.cloud import speech

    speech_client = speech.SpeechClient()

    with open(speech_file, "rb") as fh:
        raw_audio = fh.read()

    recognition_audio = speech.RecognitionAudio(content=raw_audio)

    per_channel_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    )

    request = {"config": per_channel_config, "audio": recognition_audio}
    response = speech_client.recognize(request=request)

    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {index}")
        print(f"Transcript: {best.transcript}")
        print(f"Channel Tag: {portion.channel_tag}")
Example #13
0
def stt(request):
    """Transcribe the audio carried in a request body and return it as text.

    The body is saved to "chatbot/file.ogg" and sent for recognition as
    OGG_OPUS, 48000 Hz, "ko-KR".

    Args:
        request: Request object whose ``body`` attribute holds the audio.

    Returns:
        HttpResponse whose content is the top transcripts of all results
        joined with no separator, or an error message when recognition fails.
    """
    credential_path = '..//sa-spoiler-4897b3e764af.json'
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_path

    content = request.body
    # Keep the on-disk copy of the upload; the context manager closes the
    # handle even when the write fails. The bytes are used directly instead
    # of being read back from the file that was just written.
    with open("chatbot/file.ogg", 'wb') as f:
        f.write(content)

    try:
        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
            sample_rate_hertz=48000,
            language_code="ko-KR",
        )

        response = client.recognize(config=config, audio=audio)

        resultStr = ''.join(
            result.alternatives[0].transcript for result in response.results
        )
    except DefaultCredentialsError:
        logging.warning('DefaultCredentaials error. check api key')
        resultStr = "stt 오류입니다. 관리자에게 문의하세요 (DefaultCredentalsError)"
    except Exception:
        # This must be an except clause, not a try/else: an else branch runs
        # on success and would replace every transcript with the error text.
        logging.exception('Unexpected error during speech recognition')
        resultStr = "undefined error. 관리자에게 문의하세요"

    # Returning here rather than from a finally block lets KeyboardInterrupt
    # and SystemExit propagate instead of being silently discarded.
    return HttpResponse(resultStr)
Example #14
0
    def __call__(self, data):
        """Recognize speech in data, trying up to self.retries times.

        Args:
            data: Audio content passed to the recognition request as-is.

        Returns:
            The transcript of the first alternative of the first result that
            has one. None when no attempt produces such a result, or when a
            KeyboardInterrupt is raised while trying.
        """
        try:
            # Each iteration is one full attempt: build the request, call
            # the service, and return on the first usable result.
            for _ in range(self.retries):

                audio = speech.RecognitionAudio(content=data)

                # The encoding is left unspecified; rate, language and model
                # come from the instance.
                config = speech.RecognitionConfig(
                    encoding=speech.RecognitionConfig.AudioEncoding.
                    ENCODING_UNSPECIFIED,
                    sample_rate_hertz=self.rate,
                    language_code=self.language,
                    model=self.model,
                )

                response = self.client.recognize(config=config, audio=audio)

                # Return the first result that has an alternative; results
                # without one are skipped.
                for result in response.results:
                    try:
                        return result.alternatives[0].transcript
                    except IndexError:
                        # This result has no alternatives; try the next one.
                        continue
                    except JSONDecodeError:
                        # NOTE(review): nothing visible in this try body
                        # decodes JSON - confirm this handler is reachable.
                        continue

        except KeyboardInterrupt:
            # An interrupt is reported as "no transcript", not re-raised.
            return None
Example #15
0
def sync_recognize_with_multi_region_gcs():
    """Transcribe a sample GCS file through the "eu" speech endpoint.

    Builds a client whose api_endpoint is "eu-speech.googleapis.com", runs a
    synchronous recognize request (LINEAR16, 16000 Hz, "en-US") on the
    brooklyn_bridge.raw sample and prints each result's top transcript.
    Nothing is returned.
    """
    # [START speech_multi_region]

    # Imports the Google Cloud client library
    from google.cloud import speech
    from google.api_core import client_options

    # [START speech_multi_region_client]

    # The endpoint is selected through a ClientOptions instance handed to
    # the client constructor.
    endpoint_options = client_options.ClientOptions(
        api_endpoint="eu-speech.googleapis.com"
    )
    speech_client = speech.SpeechClient(client_options=endpoint_options)
    # [END speech_multi_region_client]

    # The name of the audio file to transcribe
    sample_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"
    remote_audio = speech.RecognitionAudio(uri=sample_uri)

    linear16_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detects speech in the audio file
    response = speech_client.recognize(config=linear16_config, audio=remote_audio)

    for portion in response.results:
        top_transcript = portion.alternatives[0].transcript
        print("Transcript: {}".format(top_transcript))
Example #16
0
def transcribe_file(content, channels=2):
    """
    Transcribe audio from a file-like object into timed paragraphs.

    The request is "en-US" with word time offsets and automatic punctuation
    enabled; no encoding or sample rate is specified.

    Args:
        content: Object with a read() method; its output is the audio sent.
        channels: Value passed as audio_channel_count.

    Returns:
        list: One SpeechParagraph per result, built from the top
        alternative's transcript, confidence and a list of SpeechWord
        (word, start seconds, end seconds).
    """
    audio = speech.RecognitionAudio(content=content.read())
    config = speech.RecognitionConfig(
        language_code="en-US",
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True,
        audio_channel_count=channels,
    )

    request = {"config": config, "audio": audio}
    response = client.recognize(request=request)

    paragraphs = []
    for portion in response.results:
        best = portion.alternatives[0]
        transcript = best.transcript
        confidence = best.confidence

        timed_words = [
            SpeechWord(
                info.word,
                info.start_time.total_seconds(),
                info.end_time.total_seconds(),
            )
            for info in best.words
        ]

        paragraphs.append(SpeechParagraph(transcript, confidence, timed_words))

    return paragraphs
Example #17
0
    def transcribe(self):
        """Send data/recording.wav for recognition, then rebuild the results.

        The audio is sent as LINEAR16, 16000 Hz, two channels, "en-US".
        Afterwards numChars is set to 20 and calcLines() and
        generateResults() are run.

        Returns:
            Tuple (self.results, self.numLines) as they stand after those
            two calls.
        """
        # Full path of the audio file, Replace with your file name
        recording_path = "data/recording.wav"

        # Loads the audio file into memory
        with io.open(recording_path, "rb") as recording:
            raw_audio = recording.read()
            audio = speech.RecognitionAudio(content=raw_audio)

        stereo_config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            audio_channel_count=2,
            language_code="en-US",
            sample_rate_hertz=16000,
        )

        # Sends the request to google to transcribe the audio
        # NOTE(review): the response is bound to a local and not read again
        # in this method - confirm calcLines()/generateResults() do not
        # need it.
        request = {"config": stereo_config, "audio": audio}
        response = self.client.recognize(request=request)

        self.numChars = 20

        self.calcLines()
        self.generateResults()

        return self.results, self.numLines
Example #18
0
def transcribe_context_classes(storage_uri):
    """Transcribe audio while hinting the recognizer with a class token.

    A speech context holding the '$TIME' phrase is attached to a synchronous
    recognize request (LINEAR16, 8000 Hz, "en-US"). The top transcript of
    each result is printed; nothing is returned.

    Args:
        storage_uri: URI of the audio, for example
            'gs://YOUR_BUCKET_ID/path/to/your/file.wav'.
    """
    # [START speech_context_classes]
    from google.cloud import speech

    speech_client = speech.SpeechClient()

    remote_audio = speech.RecognitionAudio(uri=storage_uri)

    # SpeechContext reference:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
    # Supported phrases (class tokens):
    # https://cloud.google.com/speech-to-text/docs/class-tokens
    time_hint = speech.SpeechContext(phrases=['$TIME'])

    # RecognitionConfig reference (encoding, sample_rate_hertz):
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
    hinted_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        speech_contexts=[time_hint],
    )

    response = speech_client.recognize(config=hinted_config, audio=remote_audio)

    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {index}")
        print(f"Transcript: {best.transcript}")
def transcribe_file_with_auto_punctuation(path):
    """Transcribe a local audio file with automatic punctuation turned on.

    The request is synchronous and uses LINEAR16, 8000 Hz, "en-US". The top
    transcript of each result is printed; nothing is returned.

    Args:
        path: Path of the audio file, for example
            'resources/commercial_mono.wav'; read in binary mode.
    """
    # [START speech_transcribe_auto_punctuation]
    from google.cloud import speech

    speech_client = speech.SpeechClient()

    with io.open(path, "rb") as fh:
        raw_audio = fh.read()

    recognition_audio = speech.RecognitionAudio(content=raw_audio)
    punctuated_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # This flag is what requests punctuation in the transcript.
        enable_automatic_punctuation=True,
    )

    request = {"config": punctuated_config, "audio": recognition_audio}
    response = speech_client.recognize(request=request)

    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {index}")
        print(f"Transcript: {best.transcript}")
Example #20
0
def transcribe_gcs_with_multichannel(gcs_uri):
    """Transcribe a two-channel audio file at a URI, channel by channel.

    The request is synchronous and uses LINEAR16 at 44100 Hz, "en-US", two
    channels with separate recognition per channel enabled. For every result
    the top transcript and the channel tag are printed; nothing is returned.

    Args:
        gcs_uri: URI of the audio, passed as the recognition audio uri.
    """
    # [START speech_transcribe_multichannel_gcs]
    from google.cloud import speech

    speech_client = speech.SpeechClient()

    remote_audio = speech.RecognitionAudio(uri=gcs_uri)

    per_channel_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    )

    response = speech_client.recognize(
        config=per_channel_config, audio=remote_audio
    )

    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {index}")
        print(f"Transcript: {best.transcript}")
        print(f"Channel Tag: {portion.channel_tag}")
Example #21
0
def run_quickstart():
    """Transcribe kor_audio.wav, stored beside this module, and print it.

    The audio is sent synchronously as LINEAR16, 44100 Hz, two channels,
    'ko-KR'. The top transcript of each result is printed; nothing is
    returned.
    """
    # [START speech_quickstart]
    import io
    import os

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    speech_client = speech.SpeechClient()
    # [END migration_client]

    # The name of the audio file to transcribe
    module_dir = os.path.dirname(__file__)
    wav_path = os.path.join(module_dir, 'kor_audio.wav')

    # Loads the audio into memory
    with io.open(wav_path, 'rb') as wav:
        raw_audio = wav.read()
        audio = speech.RecognitionAudio(content=raw_audio)

    stereo_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code='ko-KR',
    )

    # Detects speech in the audio file
    response = speech_client.recognize(config=stereo_config, audio=audio)

    for portion in response.results:
        top_transcript = portion.alternatives[0].transcript
        print('Transcript: {}'.format(top_transcript))
Example #22
0
async def speech2text(content: bytes,
                      language_code: str = 'ru-RU',
                      channels: int = 1,
                      rate: int = 16000,
                      encoding: str = 'LINEAR16') -> str:
    """Transcribe audio bytes, caching the transcript in redis.

    The cache key combines the SHA-512 digest of the audio with every
    recognition parameter, so a change to any of them is a cache miss.

    Args:
        content: Audio data to transcribe.
        language_code: Language code placed in the recognition config.
        channels: Value passed as audio_channel_count.
        rate: Value passed as sample_rate_hertz.
        encoding: Name of an AudioEncoding member; matched case-insensitively
            by upper-casing it first.

    Returns:
        The top transcript of the first result, or an empty string when the
        response contains no results.

    Raises:
        KeyError: If encoding does not name an AudioEncoding member.
    """
    hash_file: str = hashlib.sha512(content).hexdigest()
    cache_key: str = f'speech2text_{hash_file}_{language_code}_{channels}_{rate}_{encoding}'

    cached_response: str = await redis.get(key=cache_key)
    if cached_response:
        return cached_response

    audio = speech.RecognitionAudio(content=content)
    # An unknown name raises KeyError here, which is left to propagate.
    audio_encoding = speech.RecognitionConfig.AudioEncoding[encoding.upper()]

    config = speech.RecognitionConfig(
        encoding=audio_encoding,
        sample_rate_hertz=rate,
        language_code=language_code,
        audio_channel_count=channels,
    )

    response = await CLIENT.recognize(config=config, audio=audio)

    # Indexing results[0] on an empty list would raise IndexError. An empty
    # transcript is not cached: the falsy check above would ignore it anyway.
    if not response.results:
        return ''

    transcript = response.results[0].alternatives[0].transcript
    await redis.setex(key=cache_key,
                      timeout=TTL_CACHE,
                      value=transcript)

    return transcript
Example #23
0
def run_quickstart():
    """Transcribe the brooklyn_bridge.raw sample from GCS and print it.

    The audio is sent synchronously as LINEAR16, 16000 Hz, "en-US". The top
    transcript of each result is printed; nothing is returned.
    """
    # [START speech_quickstart]

    # Imports the Google Cloud client library
    # [START speech_python_migration_imports]
    from google.cloud import speech

    # [END speech_python_migration_imports]

    # Instantiates a client
    # [START speech_python_migration_client]
    speech_client = speech.SpeechClient()
    # [END speech_python_migration_client]

    # The name of the audio file to transcribe
    sample_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"
    remote_audio = speech.RecognitionAudio(uri=sample_uri)

    linear16_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detects speech in the audio file
    response = speech_client.recognize(config=linear16_config, audio=remote_audio)

    for portion in response.results:
        top_transcript = portion.alternatives[0].transcript
        print("Transcript: {}".format(top_transcript))
Example #24
0
def submit_speech_api_request(file: bytes, language_code: str) -> dict:
    """
    Send audio bytes to the Speech-to-Text API and collect the transcripts.

    :param file: audio content, forwarded to the API unchanged
    :param language_code: language code placed in the recognition config
    :return: dict with a single key, "transcripts", listing the top
        transcript of each result in response order
    """

    # FUTURE: Avoid scoping lifetime of the client to function?
    speech_client = speech.SpeechClient()

    # Source: https://cloud.google.com/speech-to-text/docs/sync-recognize
    # Only the options needed for a small wrapper around the actual
    # Speech-to-Text API are set here.
    recognition_audio = speech.RecognitionAudio(content=file)

    # TODO Extract 'audio_channel_count' from the input file itself
    # https://stackoverflow.com/questions/47905083/how-to-check-number-of-channels-in-my-audio-wav-file-using-ffmpeg-command
    recognition_config = speech.RecognitionConfig(
        audio_channel_count=2,
        language_code=language_code,
    )
    response = speech_client.recognize(
        config=recognition_config, audio=recognition_audio
    )

    # FUTURE: Additional error handling may be appropriate.

    # Results cover consecutive portions of the audio; alternative 0 of each
    # is the most likely transcription of that portion.
    transcripts = []
    for portion in response.results:
        # FUTURE: Check if alternatives[0] always exists
        transcripts.append(f"{portion.alternatives[0].transcript}")

    # Format JSON output
    return {"transcripts": transcripts}
Example #25
0
def transcribe_file(speech_file):
    """Transcribe a local audio file and return the combined transcript.

    The request is synchronous and uses LINEAR16, 48000 Hz, "en-US" with
    automatic punctuation enabled. Each result's top transcript is printed.

    Args:
        speech_file: Path of the audio file, read in binary mode.

    Returns:
        The top transcripts of all results, joined with no separator.
    """
    from google.cloud import speech
    import io

    speech_client = speech.SpeechClient()

    with io.open(speech_file, "rb") as fh:
        raw_audio = fh.read()

    recognition_audio = speech.RecognitionAudio(content=raw_audio)

    punctuated_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )

    response = speech_client.recognize(
        config=punctuated_config, audio=recognition_audio
    )

    # Results cover consecutive portions of the audio; alternative 0 of each
    # is the most likely transcription of that portion.
    pieces = []
    for portion in response.results:
        print(f"Transcript: {portion.alternatives[0].transcript}")
        pieces.append(portion.alternatives[0].transcript)

    return "".join(pieces)
Example #26
0
def transcribe_file():
    """Transcribe proken.wav and write the transcript to proken.txt.

    The audio is sent as LINEAR16, 44100 Hz, 'ja-JP' with separate
    recognition per channel enabled, through one long-running recognize
    operation. The top transcript of every result is written to the output
    file back to back, with no separator. Nothing is returned.
    """
    from google.cloud import speech
    import io
    client = speech.SpeechClient()

    with io.open('proken.wav', 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ja-JP',
        #audio_channel_count=2,
        enable_separate_recognition_per_channel=True
    )

    # Submit the request exactly once: a second submission would start a
    # second, redundant operation whose result is never read.
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=90)

    with io.open("proken.txt", "w", encoding="utf-8") as f:
        for result in response.results:
            f.write(u'{}'.format(result.alternatives[0].transcript))
Example #27
0
    def __init__(self):
        """Run a long-running recognition of a fixed FLAC file on GCS.

        The request is "en-AU", two channels, with word time offsets
        enabled. Every word of each result's first alternative is visited,
        but nothing is stored on the instance.
        """
        # Instantiates a client
        speech_client = speech.SpeechClient()

        # The name of the audio file to transcribe
        source_uri = "gs://gradclip1-audio/small.flac"
        remote_audio = speech.RecognitionAudio(uri=source_uri)

        timed_config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
            language_code="en-AU",
            audio_channel_count=2,
            enable_word_time_offsets=True,
        )

        # Detects speech in the audio file
        pending = speech_client.long_running_recognize(
            config=timed_config, audio=remote_audio
        )
        print("Waiting for operation to complete...")
        response = pending.result(timeout=9000)

        for portion in response.results:
            best = portion.alternatives[0]
            for word in best.words:
                # NOTE(review): both values are read and then discarded.
                # If these offsets are datetime.timedelta objects,
                # .microseconds is only the sub-second part - confirm.
                start_time = word.start_time.microseconds
                end_time = word.end_time.microseconds
Example #28
0
def transcribe_gcs(gcs_uri):
    """Transcribe the audio at gcs_uri with a long-running operation.

    The request uses LINEAR16, 44100 Hz, "en-US" with automatic punctuation
    enabled. The transcript and confidence of each result's first
    alternative are printed.

    Args:
        gcs_uri: URI of the audio, passed as the recognition audio uri.

    Returns:
        The top transcripts of all results, each preceded by one space, so
        a non-empty return value starts with a space.
    """
    speech_client = speech.SpeechClient()

    remote_audio = speech.RecognitionAudio(uri=gcs_uri)
    punctuated_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )

    pending = speech_client.long_running_recognize(
        config=punctuated_config, audio=remote_audio
    )

    print("Waiting for operation to complete...")
    response = pending.result(timeout=100000)

    # Results cover consecutive portions of the audio; alternative 0 of each
    # is the most likely transcription of that portion.
    pieces = []
    for portion in response.results:
        print(f"Transcript: {portion.alternatives[0].transcript}")
        pieces.append(" " + portion.alternatives[0].transcript)
        print(f"Confidence: {portion.alternatives[0].confidence}")
    return "".join(pieces)
def transcribe_audio(speech_file):
    """Transcribe a local audio file using a service-account client.

    The client is built from my_credentials_file_path. When that fails, the
    failure is logged at INFO level and an empty string is returned instead
    of raising. The audio is sent as LINEAR16, 8000 Hz, "en-GB".

    Args:
        speech_file: Path of the audio file, read in binary mode.

    Returns:
        The top transcript of the last result in the response, or an empty
        string when there are no results or no client could be created.
    """
    retval = ""

    try:
        transcription_client = speech.SpeechClient.from_service_account_file(
            my_credentials_file_path)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed. The path is
        # passed as a logging argument because concatenating a non-string
        # path would raise inside this handler.
        logger.info("Not using Google speech to text as credz file not at : %s",
                    my_credentials_file_path)
        return ""

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-GB",
    )

    response = transcription_client.recognize(config=config, audio=audio)

    # NOTE(review): each iteration overwrites the previous value, so only the
    # last result is returned - confirm that dropping earlier portions of the
    # audio is intended.
    for result in response.results:
        retval = result.alternatives[0].transcript

    return retval
Example #30
0
def get_transcript(uri):
    """
    Get transcript for a audio WAVE file stored in GCS bucket

    Only the language code ("en-US") is set in the recognition config. The
    request runs as a long-running operation.

    @param: uri - File location in the google cloud storage bucket
    @return: tuple (transcript, confidence) - the top transcripts of all
             results joined with no separator, and the mean of their
             confidences (0.0 when the response has no results)
    """
    # create speech client
    client = speech.SpeechClient()

    # actual audio file to recognize
    audio = speech.RecognitionAudio(uri=uri)
    # recognition config
    config = speech.RecognitionConfig(language_code="en-US")

    # execute the recognition API
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    # wait for 90 seconds max
    response = operation.result(timeout=90)

    confidences = []
    pieces = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        confidences.append(best.confidence)
        pieces.append(best.transcript)
    transcript = "".join(pieces)

    # With no results there is nothing to average; dividing by
    # len(confidences) would raise ZeroDivisionError.
    confidence = sum(confidences) / len(confidences) if confidences else 0.0

    return transcript, confidence