Example #1
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1
    from google.cloud.speech_v1 import enums
    from google.cloud.speech_v1 import types
    import io
    client = speech_v1.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config]

    # [START speech_python_migration_sync_response]
    response = client.recognize(config, audio)
    # [END speech_python_migration_sync_request]
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
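The positional client.recognize(config, audio) call above is the pre-1.0 surface of google-cloud-speech. From 1.0 onward the client takes keyword arguments and the types are exposed on the top-level package; a minimal sketch of the same request under the newer API (the file path is a placeholder):

# Sketch only: the same synchronous request against google-cloud-speech >= 1.0.
# 'audio.wav' is a hypothetical 16 kHz LINEAR16 recording.
import io

from google.cloud import speech

client = speech.SpeechClient()

with io.open('audio.wav', 'rb') as f:
    audio = speech.RecognitionAudio(content=f.read())

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='ja-JP')

response = client.recognize(config=config, audio=audio)
for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))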
Example #2
async def speech_to_text(e):
    opts = e.pattern_match.group(1) or ""
    args, _ = parse_arguments(opts, ['lang'])

    lang = args.get('lang', DEFAULT_LANG)
    await e.edit("**Transcribing...**")

    message = await e.get_reply_message()
    file = message.audio or message.voice

    if not file:
        await e.edit("**No audio file specified**", delete_in=3)
        return

    file = await bot.download_file(file)

    audio = types.RecognitionAudio(content=file)  # RecognitionAudio takes the raw bytes directly

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
        sample_rate_hertz=16000,
        language_code=lang)

    response = STTClient.long_running_recognize(config, audio)
    op_result = response.result()
    result = op_result.results[0].alternatives[0]

    output = f"**Transcript:** {result.transcript}\n\n**Confidence:** __{round(result.confidence, 5)}__"
    await e.edit(output)
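long_running_recognize returns a google.api_core Operation, so the response.result() call above blocks until the transcript is ready. A sketch of the same wait with an explicit timeout and error handling (the 90-second value is an arbitrary choice):

# Sketch: bounding the wait on the long-running operation. STTClient, config
# and audio are the objects from the handler above; 90 s is an assumption.
from google.api_core.exceptions import GoogleAPICallError

operation = STTClient.long_running_recognize(config, audio)
try:
    op_result = operation.result(timeout=90)  # raises on timeout or RPC error
except GoogleAPICallError as err:
    op_result = None
    print(f'recognition failed: {err}')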
Example #3
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient.from_service_account_json(
        "./MyProject-90749589d270.json")
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use. listen_print_loop is
        # assumed to return the user's recognized phrase.
        user_phrase_result = listen_print_loop(responses)

    return user_phrase_result
Example #4
    def speech2text(self, file_path):
        # Instantiates a client
        client = speech.SpeechClient()

        # The name of the audio file to transcribe
        file_name = file_path

        # Loads the audio into memory
        with io.open(file_name, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.sample_rate,
            language_code=self.lang_code)

        # Detects speech in the audio file
        response = client.recognize(config, audio)

        result_li = []
        for result in response.results:
            result_li.append(format(result.alternatives[0].transcript))

        return result_li
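A minimal usage sketch for speech2text(); the enclosing class is not shown in the listing, so the name and constructor below are assumptions:

# Hypothetical enclosing class: only sample_rate and lang_code are required
# by speech2text() above, which would be defined as a method of this class.
class Transcriber:
    def __init__(self, sample_rate=16000, lang_code='en-US'):
        self.sample_rate = sample_rate
        self.lang_code = lang_code

# for line in Transcriber().speech2text('audio.wav'):
#     print(line)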
Example #5
    def start(self, callback):
        """
        Args:
            callback (function): Function that is called when text is transcribed from speech
        """
        try:
            with MicrophoneInput() as mic:
                print("Starting SpeechToTextClient")
                self._mic = mic
                audio_generator = self._mic.generator()
                config = types.RecognitionConfig(
                    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                    sample_rate_hertz=self._mic.RATE,
                    language_code=self.language_code,
                    use_enhanced=True,
                    speech_contexts=self.speech_context)
                streaming_config = types.StreamingRecognitionConfig(
                    config=config, interim_results=True)
                requests = (types.StreamingRecognizeRequest(
                    audio_content=content) for content in audio_generator)
                responses = self.client.streaming_recognize(
                    streaming_config, requests)
                for response in responses:
                    if not response.results:  # no results
                        continue
                    # first result is best result
                    result = response.results[0]
                    if not result.alternatives:
                        continue
                    transcript = result.alternatives[0].transcript.strip().lower()
                    callback((transcript, result.is_final))
        except OutOfRange:
            self.restart(callback)
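start() leans on the OutOfRange error that google.api_core raises when a streaming session exceeds the API's duration limit (roughly five minutes of audio), restarting itself when that happens. A sketch of the imports the method assumes; MicrophoneInput is project-specific:

# Imports assumed by start(); MicrophoneInput comes from this project, not
# from the Google client library.
from google.api_core.exceptions import OutOfRange
from google.cloud.speech import enums, types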
Example #6
    def get_config(self):
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-US')

        return config
Example #7
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    # The language code you speak.
    language_code = 'th-TH'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    # Initial loop value
    rounds = 1
    while True:
        try:
            print('streaming loop :' + str(rounds))
            with MicrophoneStream(RATE, CHUNK) as stream:
                audio_generator = stream.generator()
                # Create request data
                requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audio_generator)
                # POST data to google cloud speech
                responses = client.streaming_recognize(streaming_config, requests)
                # Now, put the transcription responses to use.
                listen_print_loop(responses)
        except Exception as err:
            print(err)
            rounds += 1
Example #8
def return_recognized(PATH, words):
    SCOPES = ['https://www.googleapis.com/auth/cloud-platform']
    SERVICE_ACCOUNT_FILE = 'C:/Users/Janek/PycharmProjects/test/klucz.json'
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    client = speech.SpeechClient(credentials=credentials)

    file_name = os.path.join(os.path.dirname(__file__), 'resources', PATH)

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='pl-PL',
        speech_contexts=[speech.types.SpeechContext(phrases=words)])

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    transcribed = {}

    for result in response.results:
        # print('Transcript: {}, {}'.format(result.alternatives[0].transcript, result.alternatives[0].confidence))
        transcribed[result.alternatives[0].transcript] = \
            result.alternatives[0].confidence

    return transcribed
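A usage sketch for return_recognized(); the file name and phrase hints below are made-up values (the function resolves PATH relative to its own resources/ directory):

# Hypothetical call: the WAV name and Polish phrase hints are examples only.
hints = ['lewo', 'prawo', 'stop']
for transcript, confidence in return_recognized('commands.wav', hints).items():
    print('{:.3f}  {}'.format(confidence, transcript))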
Example #9
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    path_ = pathlib.Path.cwd()
    path_ = path_ / 'teste de fluencia-2b49c4cc975c.json'
    # client = speech_v1.SpeechClient()  # create the Google API client
    client = speech_v1.SpeechClient.from_service_account_json(path_)

    with io.open(speech_file, 'rb') as audio_file:  # open the audio file
        content = audio_file.read()  # read its contents
        audio = types.RecognitionAudio(content=content)  # wrap it as RecognitionAudio

    config = types.RecognitionConfig(  # configuration used for recognition
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='pt-BR',
        speech_contexts=[{
            "phrases": utils.list_animals
        }]  # recognition hints
    )

    response = client.recognize(config, audio)  # run recognition on the audio

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.

    # for result in response.results:
    #     # The first alternative is the most likely one for this portion.
    #     print(u'Transcript: {}'.format(result.alternatives[0].transcript))

    return response
Example #10
def transcribe_with_word_time_offsets(
        speech_content: bytes) -> Iterable[Tuple[str, float, float]]:
    """Recognize words with time offsets from a speech.

    Args:
        speech_content: Binary data of the speech.

    Yields:
        The word with start time and end time that api recognized.

            [
                ('여기요', 0.0, 2.0),
                ('저기요', 3.6, 5.4),
                ('저', 5.4, 9.2),
                ('밖에서', 9.2, 9.6),
                ('장애인', 9.6, 10.0),
                ('주차', 10.0, 10.3),
                ('가능', 10.3, 10.5),
                ('까만색', 10.5, 11.3),
                ('소나타', 11.3, 11.7),
                ('글', 11.7, 11.8),
                ('찾아요', 11.8, 12.2),
                ('근데요', 12.2, 13.2)
            ]

    See:
        https://cloud.google.com/speech-to-text/docs/sync-recognize

    """
    client = SpeechClient()

    audio = types.RecognitionAudio(content=speech_content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="ko-KR",
        enable_word_time_offsets=True,
    )

    response = client.recognize(config, audio)

    for result in response.results:
        alternative = result.alternatives[0]

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            yield (
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9,
            )
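Because the function is a generator, the word timings arrive lazily as the response is walked. A consuming sketch, assuming speech.wav holds 44.1 kHz LINEAR16 Korean speech:

# Hypothetical caller: stream the word-level timestamps from a local file.
with open('speech.wav', 'rb') as f:
    for word, start, end in transcribe_with_word_time_offsets(f.read()):
        print('{:6.2f}-{:6.2f}  {}'.format(start, end, word))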
Example #11
def work1():
    global var, tflag
    var.set("ここに音声認識結果が表示されます")  # "Speech recognition results appear here"
    language_code = 'ja-JP'  # a BCP-47 language tag
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        for response in responses:
            if not response.results:
                continue
            # The `results` list is consecutive. For streaming, we only care about
            # the first result being considered, since once it's `is_final`, it
            # moves on to considering the next utterance.
            result = response.results[0]
            if not result.alternatives:
                continue

            # Display the transcription of the top alternative.
            transcript = result.alternatives[0].transcript
            if not result.is_final:
                txtlist = textwrap.wrap(transcript, int(ww / w))
                print(txtlist)
                setxt = ""
                if (len(txtlist) <= num_comment):
                    for i in range(len(txtlist)):
                        setxt += txtlist[i]
                    var.set(setxt)
                else:
                    for i in range(num_comment):
                        setxt += txtlist[len(txtlist) - num_comment + i]
                    var.set(setxt)

            else:
                # Exit recognition if any of the transcribed phrases could be
                # one of our keywords.
                if re.search(r'\b(exit|quit)\b', transcript, re.I):
                    on_closing()
Example #12
    def test_inherited_method(self):
        from google.cloud.speech_v1 import types

        client = self._make_one()

        config = types.RecognitionConfig(encoding='FLAC')
        audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')
        with mock.patch.object(client, '_recognize') as recognize:
            client.recognize(config, audio)

            # Assert that the underlying GAPIC method was called as expected.
            recognize.assert_called_once_with(
                types.RecognizeRequest(
                    config=config,
                    audio=audio,
                ), None)
Example #13
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""

    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code='en-US',
        model="command_and_search")
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    return_text = []
    confidence = []

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.

        for result in response.results:
            # print('Finished: {}'.format(result.is_final))
            # print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                # print('Confidence: {}'.format(alternative.confidence))
                # print(u'Transcript: {}'.format(alternative.transcript))
                return_text.append(alternative.transcript)
                confidence.append(alternative.confidence)

        # Average the alternative confidences and return after the first
        # batch of responses.
        confidence = np.mean(confidence)
        return return_text, confidence
    # Fallback if the stream yielded no responses at all.
    return return_text, confidence
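transcribe_streaming depends on several module-level names that the listing does not show; a sketch of plausible definitions (the 16 kHz RATE in particular is an assumption):

# Names assumed by transcribe_streaming(); the sample rate is illustrative.
import io

import numpy as np
from google.cloud import speech
from google.cloud.speech import enums, types

RATE = 16000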
Example #14
    def test_inherited_method(self):
        from google.cloud.speech_v1 import types

        client = self._make_one()

        config = types.RecognitionConfig(encoding='FLAC')
        audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')
        patch = mock.patch.object(client, '_recognize', autospec=True)
        with patch as recognize:
            client.recognize(config, audio)

            # Assert that the underlying GAPIC method was called as expected.
            assert recognize.call_count == 1
            _, args, _ = recognize.mock_calls[0]
            assert args[0] == types.RecognizeRequest(
                config=config,
                audio=audio,
            )
Example #15
def transcribe_gcs(gcs_uri):
    """Transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    # [START speech_python_migration_config_gcs]
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config_gcs]

    response = client.recognize(config, audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
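Synchronous recognize() is limited to roughly one minute of audio; for longer files already in GCS, the asynchronous variant is the usual route. A minimal sketch reusing the config and audio above (the 300-second timeout is a placeholder):

# Sketch: asynchronous recognition for audio longer than ~1 minute.
operation = client.long_running_recognize(config, audio)
response = operation.result(timeout=300)  # timeout value is an assumption
for result in response.results:
    print(u'Transcript: {}'.format(result.alternatives[0].transcript))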
Example #16
def upload():
    return render_template('post.html')

def transcribe_upload():
    # Reconstructed: in the original listing this body sat, unreachable,
    # after upload()'s return statement. The function name here is an
    # assumption; the logic is unchanged.
    uri = request.files['audio'].stream.read()

    client = speech_v1.SpeechClient()

    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    sample_rate_hertz = 48000
    language_code = 'en-US'
    config = types.RecognitionConfig(encoding=encoding,
                                     sample_rate_hertz=sample_rate_hertz,
                                     language_code=language_code)
    # uri = 'gs://bucket_name/file_name.flac'
    audio = types.RecognitionAudio(content=uri)
    response = client.recognize(config, audio)
    # return response.results[0].alternatives[0].transcript

    # Return the contents of a local sample transcript instead.
    sample_txt = ""
    with io.open('sample.txt', mode='r', encoding='utf-8', errors='ignore') as x:
        for line in x:
            sample_txt += line
    print(sample_txt)
    return sample_txt
Example #17
def run_quickstart():
    client = speech_v1.SpeechClient()
    # The name of the audio file to transcribe
    file_name = '../../sound/sample.wav'
    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ko-KR',
        audio_channel_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=90)

    def closecallback():
        window.destroy()

    window = tkinter.Tk()
    window.title("AI Speaker Test")
    window.geometry("640x400+100+100")
    window.resizable(False, False)
    text = tkinter.Text(window)

    for result in response.results:
        text.insert(tkinter.CURRENT, '음성출력\n')  # "voice output"
        if '메시지' in result.alternatives[0].transcript:  # "message"
            text.insert(tkinter.CURRENT, result.alternatives[0].transcript)
            text.pack()
            button = tkinter.Button(window,
                                    text='Close',
                                    command=closecallback)
            button.place(x=0, y=350, relx=0.5)
            window.mainloop()
            playsound(file_name)
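audio_channel_count=2 above declares the stereo layout, but by default the API does not transcribe the channels separately; the v1 config exposes a flag for that. A sketch of the adjusted config:

# Sketch: per-channel recognition for the stereo sample above.
config = types.RecognitionConfig(
    encoding=speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=44100,
    language_code='ko-KR',
    audio_channel_count=2,
    enable_separate_recognition_per_channel=True)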
Example #18
    def listen(self, single_utterance=True):
        speech_contexts = types.SpeechContext(phrases=['male'])
        language_code = 'en-US'

        client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            speech_contexts=[speech_contexts],
            language_code=language_code)
        # If single_utterance=True, the go command uttered by the subject after a long pause for self-admin is not
        # registered
        streaming_config = types.StreamingRecognitionConfig(
            config=config, single_utterance=single_utterance)

        with MicrophoneStream(RATE, CHUNK) as stream:
            audio_generator = stream.generator()
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)

            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.
            return self.listen_print_loop(responses)
Example #19
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-GB'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        try:
            responses = client.streaming_recognize(streaming_config,
                                                   requests,
                                                   timeout=21)

        except Exception:
            return "no result"
        num_chars_printed = 0
        for response in responses:
            try:
                if not response.results:
                    print("no result ")
                    continue
                # The `results` list is consecutive. For streaming, we only care about
                # the first result being considered, since once it's `is_final`, it
                # moves on to considering the next utterance.
                result = response.results[0]

                if not result.alternatives:
                    print("no alternatives")
                    continue

                # Display the transcription of the top alternative.
                transcript = result.alternatives[0].transcript

                # Display interim results, but with a carriage return at the end of the
                # line, so subsequent lines will overwrite them.
                #
                # If the previous result was longer than this one, we need to print
                # some extra spaces to overwrite the previous result
                overwrite_chars = ' ' * (num_chars_printed - len(transcript))

                if not result.is_final:
                    sys.stdout.write(transcript + overwrite_chars + '\r')
                    sys.stdout.flush()
                    print("loop no result")
                    return transcript + overwrite_chars

                num_chars_printed = 0
            except Exception:
                return ''
Example #20
import sys
import time

from google.oauth2 import service_account
from google.cloud import speech
from google.cloud.speech import types
from stream_recognition_class import StreamRecognition

start = time.time()
lan_code = sys.argv[1]
translator_code = sys.argv[2]
RATE = 16000
CHUNK = int(RATE / 10)

credentials = service_account.Credentials.from_service_account_file(
    'Location of API Key File')

data = []
client = speech.SpeechClient(credentials=credentials)
config = types.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=lan_code)
streaming_config = types.StreamingRecognitionConfig(config=config,
                                                    interim_results=True)

with StreamRecognition(RATE, CHUNK) as stream:
    audio_generator = stream.speech_generator()
    requests = (types.StreamingRecognizeRequest(audio_content=content)
                for content in audio_generator)

    responses = client.streaming_recognize(streaming_config, requests)

    end = (time.time() - start) - 10
    formatted_time = "{:.2f}".format(end)
    while True:
        fetched_text = strem_recognition_module.print_speech_loop(
Example #21
import io
from google.cloud import speech_v1
from google.cloud.speech_v1 import enums, types

client = speech_v1.SpeechClient()
# Note: LINEAR16 expects raw PCM/WAV data; despite its .mp3 name, this file
# must contain 16 kHz linear PCM for the request to succeed.
with io.open("output.mp3", 'rb') as audio_file:
    content = audio_file.read()

audio = types.RecognitionAudio(content=content)

config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US')  # BCP-47 tags use a hyphen, not an underscore

response = client.recognize(config, audio)
print(response)
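print(response) dumps the full protobuf; a short sketch that extracts just the top transcript from each result:

# Print only the top transcript from each consecutive result.
for result in response.results:
    print(result.alternatives[0].transcript)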