Example #1
0
def transcribe_wav_file(speech_file):
    """Transcribe the given audio file, yielding transcripts with pacing.

    Sleeps between consecutive results so transcripts are yielded roughly
    at the cadence of the original audio (the gap between one result's
    last word and the next result's first word).

    Args:
        speech_file (str): Path to a local stereo LINEAR16 WAV file
            sampled at 44100 Hz.

    Yields:
        str: Transcript of each consecutive portion of the audio.
    """
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        audio_channel_count=2,
        enable_separate_recognition_per_channel=False,
        max_alternatives=1,
        enable_word_time_offsets=True)

    response = client.recognize(config=config, audio=audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    prev_end = None
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        words = result.alternatives[0].words
        first_start = words[0].start_time
        if prev_end is not None:
            # BUG FIX: td.seconds truncated sub-second gaps (and misbehaves
            # for negative timedeltas); sleep the full fractional gap,
            # clamped at zero in case results overlap.
            gap = first_start - prev_end
            time.sleep(max(gap.total_seconds(), 0))
        # BUG FIX: the pause before the *next* result should be measured
        # from the end of this result's LAST word, not its first word.
        prev_end = words[-1].end_time
        yield result.alternatives[0].transcript
Example #2
0
def main():
    """Stream microphone audio to the Speech API and print transcripts.

    Opens the default microphone, feeds audio chunks to a streaming
    recognition session, and hands the responses to listen_print_loop.
    """
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = "en-US"  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code
    )

    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        responses = client.streaming_recognize(streaming_config, requests)
        # FIX: removed unused locals start_time / wait_time (dead code).
        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Example #3
0
def run_quickstart():
    """Synchronously transcribe the bundled Korean sample file 'file.wav'."""
    # [START speech_quickstart]
    import io
    import os

    # Imports the Google Cloud client library
    # [START speech_python_migration_imports]
    from google.cloud import speech

    # [END speech_python_migration_imports]

    # Instantiates a client
    # [START speech_python_migration_client]
    client = speech.SpeechClient()
    # [END speech_python_migration_client]

    # Path of the audio file to transcribe, located next to this module.
    wav_path = os.path.join(os.path.dirname(__file__), ".", "file.wav")

    # Load the audio into memory.
    with io.open(wav_path, "rb") as wav_file:
        audio = speech.RecognitionAudio(content=wav_file.read())

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="ko-KR",
    )

    # Detect speech in the audio file.
    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")
def get_transcript_long(content: bytes = None, audio_path: str = None):
    """Asynchronously transcribe a long FLAC audio file.

    At least one source must be supplied; `content` takes precedence when
    both are given.

    Args:
        content (bytes): Raw audio bytes.
        audio_path (str): Path or URI of the audio file.

    Returns:
        object: The long-running recognition response.

    Raises:
        ValueError: If both `content` and `audio_path` are None.
    """
    if content is None and audio_path is None:
        raise ValueError('At least one parameter cannot be None.')

    if content is None:
        audio = speech.RecognitionAudio(uri=audio_path)
    else:
        audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Print the transcript for every consecutive portion of the audio.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        top = result.alternatives[0]
        print(u"Transcript: {}".format(top.transcript))
        print("Confidence: {}".format(top.confidence))

    return response
Example #5
0
def audio_to_text(filename):
    """Synchronously transcribe an uploaded stereo WAV file and return the
    transcript of the first recognition result."""
    # [START speech_quickstart]
    import io

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    # Path of the uploaded audio file to transcribe.
    upload_path = os.path.join(os.path.dirname(__file__), '.', 'uploads',
                               filename)

    # Load the audio into memory.
    with io.open(upload_path, 'rb') as audio_file:
        audio = speech.RecognitionAudio(content=audio_file.read())

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True)

    # Detect speech in the audio file.
    response = client.recognize(config=config, audio=audio)

    return response.results[0].alternatives[0].transcript
def transcribe_file_with_word_time_offsets(speech_file):
    """Synchronously transcribe the given audio file and print per-word
    start/end time offsets."""
    from google.cloud import speech

    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        audio = speech.RecognitionAudio(content=audio_file.read())

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_word_time_offsets=True,
    )

    response = client.recognize(request={"config": config, "audio": audio})

    for result in response.results:
        best = result.alternatives[0]
        print("Transcript: {}".format(best.transcript))

        # One entry per recognized word, with its time offsets.
        for info in best.words:
            word = info.word
            start_time = info.start_time
            end_time = info.end_time
            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )
    def create_speech_config():
        """Build a StreamingRecognitionConfig, seeding the recognizer with
        context phrases loaded from confvars.G_PHRASES_PATH (if set)."""
        phrases = []

        if len(confvars.G_PHRASES_PATH) != 0:
            with open(confvars.G_PHRASES_PATH,
                      "r",
                      encoding=confvars.G_PHRASES_ENCODING) as fp:
                for raw_line in fp:
                    if raw_line:
                        # Drop non-ASCII characters from each phrase.
                        cleaned = raw_line.strip().encode(
                            'ascii', 'ignore').decode('ascii')
                        phrases.append(cleaned)
        else:
            glbl.main_logger.info(
                f"Phrases file {confvars.G_PHRASES_PATH} is null.")

        glbl.main_logger.info(f"phrases as context, num={len(phrases)}")

        # Cap the number of phrases at the configured maximum.
        speech_context = speech.SpeechContext(
            phrases=phrases[:confvars.G_MAX_PHRASES])

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=confvars.G_AUD_SAMPLING_RATE,
            enable_word_time_offsets=False,
            model='video',
            profanity_filter=True,
            enable_automatic_punctuation=True,
            speech_contexts=[speech_context],
            language_code=confvars.G_LANGUAGE_CODE)

        return speech.StreamingRecognitionConfig(config=config,
                                                 interim_results=True)
def transcribe_gcs(gcs_uri, lang, creds):
    """Asynchronously transcribe the audio at gcs_uri, dump the raw response
    to response.json, and generate one-word and YouTube-style SRT files."""
    client = speech.SpeechClient.from_service_account_json(creds)

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        sample_rate_hertz=16000,
        language_code=lang,
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=250)

    # Persist the response to disk, then reload it so the SRT generators
    # work from the JSON representation.
    data = proto_message_to_dict(response)
    with open('response.json', 'w') as f:
        json.dump(data, f)

    with open('response.json', 'r') as f:
        data = json.load(f)

    onewordSRT('abhi_oneword.srt', data)
    youtubeStyleSRT('abhi_yt.srt', data)
Example #9
0
def transcribe_file(speech_file, file_name):
    """Transcribe the given audio file.

    Args:
        speech_file (str): Path to a stereo audio file.
        file_name (str): Unused here; kept for caller compatibility.

    Returns:
        tuple: (concatenated transcript, confidence of the last result),
        or a fallback message with confidence 0 when there is no response.
    """
    client = speech.SpeechClient.from_service_account_json('key.json')

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        audio_channel_count=2,
        language_code="en-US",
    )

    response = client.recognize(config=config, audio=audio)

    # Each result covers a consecutive portion of the audio; stitch them
    # together into a single transcript string.
    if not response:
        return ("Could not create lyrics..", 0)
    print("RES", response)
    text = ""
    confidence = 0
    for result in response.results:
        text += result.alternatives[0].transcript + " "
        confidence = result.alternatives[0].confidence
    return (text, confidence)
Example #10
0
def transcribe_file():
    """Asynchronously transcribe the local file 'proken.wav' (Japanese,
    mono LINEAR16 at 44100 Hz) and write the transcript to 'proken.txt'."""
    from google.cloud import speech
    import io
    client = speech.SpeechClient()

    with io.open('proken.wav', 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ja-JP',
        audio_channel_count=1,
        enable_separate_recognition_per_channel=True)

    # BUG FIX: the original started the long-running recognition TWICE
    # (once via the request-dict form, then again via keyword args),
    # doubling the API work and billing. Start it exactly once.
    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=90)

    with io.open("proken.txt", "w", encoding="utf-8") as f:
        for result in response.results:
            f.write(u'Transcript: {}'.format(
                result.alternatives[0].transcript))
Example #11
0
def transcribe_gcs_with_multichannel(gcs_uri):
    """Transcribe the given multi-channel audio file on GCS, printing each
    result's transcript and channel tag."""
    # [START speech_transcribe_multichannel_gcs]
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        best = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(idx))
        print(u"Transcript: {}".format(best.transcript))
        print(u"Channel Tag: {}".format(result.channel_tag))
def run_quickstart():
    """Synchronously transcribe a sample file hosted on Cloud Storage."""
    # [START speech_quickstart]

    # Imports the Google Cloud client library
    # [START speech_python_migration_imports]
    from google.cloud import speech

    # [END speech_python_migration_imports]

    # Instantiates a client
    # [START speech_python_migration_client]
    client = speech.SpeechClient()
    # [END speech_python_migration_client]

    # The sample audio file to transcribe.
    gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detect speech in the audio file.
    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")
Example #13
0
def transcribe_file(speech_file):
    """Transcribe the given audio file, converting MP3 input to mono WAV.

    Args:
        speech_file (str): Path to a .wav or .mp3 audio file.

    Returns:
        str: Concatenated transcript of all recognized portions.
    """
    from google.cloud import speech
    import io

    # BUG FIX: the original tested `".mp3" in speech_file`, which also
    # matched paths that merely *contain* ".mp3" (e.g. in a directory
    # name) and then mangled the filename with the [:-4] slice. Only
    # convert when the extension is actually .mp3.
    if speech_file.endswith(".mp3"):
        sound = AudioSegment.from_mp3(speech_file)
        sound = sound.set_channels(1)
        sound.export(speech_file[:-4] + ".wav", format="wav")
        speech_file = speech_file[:-4] + ".wav"

    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="en-US")

    response = client.recognize(config=config, audio=audio)

    # Stitch the per-portion transcripts into one string.
    response_string = ""
    for result in response.results:
        response_string += result.alternatives[0].transcript

    return response_string
def transcribe_model_selection_gcs(gcs_uri, model):
    """Asynchronously transcribe the audio file at gcs_uri using the
    requested recognition model."""
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        model=model,
    )

    operation = client.long_running_recognize(
        request={"config": config, "audio": audio}
    )

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    for idx, result in enumerate(response.results):
        best = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(idx))
        print(u"Transcript: {}".format(best.transcript))
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously.

    Note that inline transcription is limited to a 60-second audio file;
    use a GCS file for audio longer than 1 minute.
    """
    from google.cloud import speech

    client = speech.SpeechClient()

    # [START speech_python_migration_async_request]
    with io.open(speech_file, "rb") as audio_file:
        audio = speech.RecognitionAudio(content=audio_file.read())

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
    )

    # [START speech_python_migration_async_response]

    operation = client.long_running_recognize(config=config, audio=audio)
    # [END speech_python_migration_async_request]

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result covers a consecutive portion of the audio.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print(u"Transcript: {}".format(best.transcript))
        print("Confidence: {}".format(best.confidence))
Example #16
0
def main():
    """Stream Ukrainian microphone audio to the Speech API with dictation
    metadata and print interim results."""
    language_code = "uk-UA"

    client = speech.SpeechClient()

    # Tag the stream as dictation so the recognizer can adapt.
    metadata = speech.RecognitionMetadata(
        interaction_type=speech.RecognitionMetadata.InteractionType.DICTATION)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        metadata=metadata,
    )

    streaming_config = speech.StreamingRecognitionConfig(config=config,
                                                         interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        request_stream = (
            speech.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in stream.generator())

        responses = client.streaming_recognize(streaming_config,
                                               request_stream)

        listen_print_loop(responses)
def transcribe_file(speech_file):
    """Transcribe the given audio file and append the transcript to
    'transcribe.txt'.

    Args:
        speech_file (str): Path to a local LINEAR16 audio file sampled at
            SAMPLE_RATE Hz.
    """
    from google.cloud import speech
    import io

    client = speech.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="en-US",
    )
    # [END speech_python_migration_config]

    # FIX: open the output file with a context manager so the handle is
    # closed even if recognition raises (the original leaked it on error).
    with open('transcribe.txt', "a") as f:
        # [START speech_python_migration_sync_response]
        response = client.recognize(config=config, audio=audio)

        # [END speech_python_migration_sync_request]
        # Each result is for a consecutive portion of the audio. Iterate
        # through them to get the transcripts for the entire audio file.
        for result in response.results:
            # The first alternative is the most likely one for this portion.
            print(result.alternatives[0].transcript)
            f.write(result.alternatives[0].transcript)
        # [END speech_python_migration_sync_response]
        f.write("\n\n")
Example #18
0
    def download_audio_and_transcribe(self, recording_url: str) -> str:
        """Download a recording over HTTP and stream it to the Speech API.

        Args:
            recording_url: URL of the LINEAR16 (8 kHz) recording.

        Returns:
            The last transcript alternative seen in the stream, or "" if
            the stream produced no results.
        """
        transcription: str = ""
        self.connect(destination="speech")
        response = requests.get(url=recording_url, stream=True)

        # BUG FIX: iter_content() defaults to chunk_size=1, which produced
        # one StreamingRecognizeRequest PER BYTE of audio. Use a real chunk
        # size so each request carries a meaningful slice of audio.
        reqs = (
            speech.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in response.iter_content(chunk_size=4096)
        )
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=8000,
            language_code="en-US",
        )
        streaming_config = speech.StreamingRecognitionConfig(config=config)

        responses = self.speech_client.streaming_recognize(config=streaming_config, requests=reqs,)

        for response in responses:
            # Once the transcription has settled, the first result will contain the
            # is_final result. The other results will be for subsequent portions of
            # the audio.
            for result in response.results:
                alternatives = result.alternatives
                # The alternatives are ordered from most likely to least.
                for alternative in alternatives:
                    # Keep overwriting so the last alternative seen wins.
                    transcription = u"{}".format(alternative.transcript)

        return transcription
def transcribe_gcs_with_word_time_offsets(gcs_uri):
    """Asynchronously transcribe the audio at gcs_uri and print per-word
    start/end time offsets."""
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_word_time_offsets=True,
    )

    operation = client.long_running_recognize(
        request={"config": config, "audio": audio}
    )

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    for result in response.results:
        best = result.alternatives[0]
        print("Transcript: {}".format(best.transcript))
        print("Confidence: {}".format(best.confidence))

        for info in best.words:
            word = info.word
            start_time = info.start_time
            end_time = info.end_time
            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )
Example #20
0
def speech2text():
    """Transcribe the local recording 'output.wav'.

    Returns:
        tuple: (list of per-result transcripts, list of
        [word, end-time-in-seconds] pairs for every recognized word).
    """
    client = speech.SpeechClient()

    with io.open("output.wav", "rb") as audio_file:
        audio = speech.RecognitionAudio(content=audio_file.read())

    config = speech.RecognitionConfig(
        audio_channel_count = 2,
        sample_rate_hertz = 44100,
        enable_word_time_offsets = True,
        language_code = "en-US",
    )

    response = client.recognize(request={"config": config, "audio": audio})

    # Collect [word, end-time] pairs across all results.
    word_timings = []
    for res in response.results:
        for w in res.alternatives[0].words:
            end_seconds = int(w.end_time.seconds) + float(w.end_time.microseconds / 1000000)
            word_timings.append([str(w.word), end_seconds])

    # Collect the stripped transcript of each result.
    transcripts = []
    for res in response.results:
        transcripts.append(res.alternatives[0].transcript.strip())
    return transcripts, word_timings
def get_transcript(content: bytes = None, audio_path: str = None):
    """Get the transcript of a (short) audio file.

    Args:
        content (bytes): Raw audio bytes; takes precedence when given.
        audio_path (str): Path or URI of the audio file.

    Returns:
        object: The recognition response.

    Raises:
        ValueError: If both `content` and `audio_path` are None.
    """
    if content is None and audio_path is None:
        raise ValueError("At least one parameter cannot be None.")

    if content is None:
        audio = speech.RecognitionAudio(uri=audio_path)
    else:
        audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detect speech in the audio file.
    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")

    return response
def transcribe_file_with_enhanced_model(path):
    """Transcribe the given audio file using the enhanced phone_call model."""
    # [START speech_transcribe_enhanced_model]
    import io

    from google.cloud import speech

    client = speech.SpeechClient()

    # path = 'resources/commercial_mono.wav'
    with io.open(path, "rb") as audio_file:
        audio = speech.RecognitionAudio(content=audio_file.read())

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enhanced models are only available to projects that
        # opt in for audio data collection.
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model="phone_call",
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        best = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(idx))
        print("Transcript: {}".format(best.transcript))
def transcribe_model_selection(speech_file, model):
    """Synchronously transcribe the given audio file with the selected
    model, printing each result and mirroring the same text to
    <speech_file>.txt."""
    from google.cloud import speech

    client = speech.SpeechClient()

    with open(speech_file, "rb") as audio_file:
        audio = speech.RecognitionAudio(content=audio_file.read())

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code="en-US",
        model=model,
    )

    response = client.recognize(config=config, audio=audio)

    # Print each line and collect the same text for the output file.
    lines = []
    for idx, result in enumerate(response.results):
        best = result.alternatives[0]
        for text in ("-" * 20,
                     "First alternative of result {}".format(idx),
                     u"Transcript: {}".format(best.transcript)):
            print(text)
            lines.append(text + "\n")

    with open(speech_file + ".txt", "w") as f:
        f.write("".join(lines))
Example #24
0
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribe the audio file at gcs_uri, printing each
    result's transcript and confidence."""
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    operation = client.long_running_recognize(
        request={"config": config, "audio": audio}
    )

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result covers a consecutive portion of the audio.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print(u"Transcript: {}".format(best.transcript))
        print("Confidence: {}".format(best.confidence))
Example #25
0
def transcribe_context_classes(storage_uri):
    """Transcribe audio from GCS, hinting the recognizer with the "$TIME"
    class token so time-of-day phrases are favored in results."""
    # [START speech_context_classes]
    from google.cloud import speech

    client = speech.SpeechClient()

    # storage_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
    audio = speech.RecognitionAudio(uri=storage_uri)

    # SpeechContext: to configure your speech_context see:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
    # Full list of supported phrases (class tokens) here:
    # https://cloud.google.com/speech-to-text/docs/class-tokens
    time_context = speech.SpeechContext(phrases=["$TIME"])

    # RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        speech_contexts=[time_context],
    )

    response = client.recognize(config=config, audio=audio)

    for idx, result in enumerate(response.results):
        best = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(idx))
        print("Transcript: {}".format(best.transcript))
    def __init__(
        self,
        language: str,
        credentials: Union[None, str, dict] = None,
        sample_rate: int = 16000,
        **kwargs,
    ) -> None:
        """Build a streaming speech client.

        Args:
            language: BCP-47 language code for recognition.
            credentials: Path to a service-account JSON file, a parsed
                service-account dict, or None for default credentials.
            sample_rate: Audio sample rate in Hz.

        Raises:
            ValueError: If `credentials` is a truthy value that is neither
                a str nor a dict.
        """
        if credentials:
            if isinstance(credentials, str):
                # A string is treated as a service-account file path.
                credentials = service_account.Credentials.from_service_account_file(
                    credentials)
            elif isinstance(credentials, dict):
                credentials = service_account.Credentials.from_service_account_info(
                    credentials)
            else:
                raise ValueError(
                    "Invalid Credentials: Only dict, str, or None accepted")

        self._client = speech.SpeechClient(credentials=credentials)
        recognition_config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code=language,
            enable_automatic_punctuation=True,
        )
        self._config = speech.StreamingRecognitionConfig(
            config=recognition_config,
            interim_results=True,
        )
        self._queue: Queue = Queue()
        self._thread: Any = None
Example #27
0
def transcribe(gs_prefix, vod):
    """Asynchronously transcribe the VOD audio at gs_prefix + vod and write
    the transcript lines to a local .txt file named after the VOD."""
    gcs_uri = gs_prefix + vod
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=44100,
        language_code="en-US",
    )

    operation = client.long_running_recognize(config=config, audio=audio)
    print('Running transcription for vod:', vod)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=3600)

    # Each result covers a consecutive portion of the audio; append every
    # transcript into one output file.
    out_path = (
        r"D:\\Users\\Brad\\Graduate_School\\2021\\data606\\data\\transcripts\\"
        + vod + '.txt')
    with open(out_path, 'w') as openfile:
        for result in response.results:
            # The first alternative is the most likely one for this portion.
            transcript = result.alternatives[0].transcript
            print(u"Transcript: {}".format(transcript))
            openfile.writelines(transcript)
Example #28
0
def speech_to_text():
    """Flask-style endpoint: transcribe the uploaded 'audio_data' file.

    Returns:
        str: The transcript of the first result, or an "ERROR: ..." message
        when credentials or transcription fail.
    """
    try:
        # move to config file?
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = app.config['GOOGLE_KEY']

        try:
            file = request.files['audio_data']
            content = file.read()

            client = speech.SpeechClient()

            audio = speech.RecognitionAudio(content = content)
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                language_code="en-US",
            )

            response = client.recognize(config = config, audio = audio)

            # Return the first transcript found.
            for result in response.results:
                result = result.alternatives[0].transcript
                print("Transcript: {}".format(result))
                return result

            return "ERROR: Google failed to transcribe!"

        except Exception as err:
            print("Failed to transcribe audio:")
            print(err)
            # FIX: return an explicit error message instead of falling
            # through and returning None (which made the route fail with
            # an empty 500 response).
            return "ERROR: Failed to transcribe audio!"

    except Exception as err:
        print("Failed to get google api credentials:")
        print(err)
        return "ERROR: Failed to get google api credentials!"
Example #29
0
def main():
    """Stream Korean (ko-KR) microphone audio to the Speech API and print
    transcripts via listen_print_loop."""

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = speech.StreamingRecognitionConfig(
        config=config,
        # single_utterance=True would make the API recognize only a single
        # spoken utterance: streaming recognition ends when the speaker
        # pauses, so speaker playback would not be picked up again.
        #single_utterance=True,
        # NOTE(review): the original (Korean) comment said "changed to
        # False, so only is_final results are returned" — but
        # interim_results is True here, so interim hypotheses ARE also
        # returned. Confirm the intended setting.
        interim_results=True)


    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (speech.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # So this can be run again even after listen_print_loop returns.
        listen_print_loop(responses)
        print('main: finished listen_print_loop')
Example #30
0
def main():
    """Stream Japanese microphone audio to the Speech API and print each
    transcript yielded by listen_print_loop."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ja-JP'  # a BCP-47 language tag
    import os
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "speech-rec-827143ff9a4c.json"
    client = speech.SpeechClient()
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = speech.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True)

    print("start rec")
    with MicrophoneStream(RATE, CHUNK) as stream:
        request_stream = (
            speech.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in stream.generator())

        responses = client.streaming_recognize(streaming_config,
                                               request_stream)

        # Now, put the transcription responses to use.
        for sentence in listen_print_loop(responses):
            # voiceroid.say(sentence)
            print(sentence)