def speech2text(self, file_path):
        # Instantiates a client
        client = speech.SpeechClient()

        # The name of the audio file to transcribe
        file_name = file_path

        # Loads the audio into memory
        with io.open(file_name, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.sample_rate,
            language_code=self.lang_code)

        # Detects speech in the audio file
        response = client.recognize(config, audio)

        result_li = []
        for result in response.results:
            result_li.append(result.alternatives[0].transcript)

        return result_li
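The method above assumes the pre-2.0 google-cloud-speech client and a class providing self.sample_rate and self.lang_code. A minimal sketch of that surrounding context (the class name, defaults, and usage are hypothetical):

# Hedged sketch of the context speech2text() assumes; class name and defaults
# are hypothetical, imports follow the pre-2.0 google-cloud-speech layout.
import io

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types


class Transcriber:
    def __init__(self, sample_rate=16000, lang_code='en-US'):
        self.sample_rate = sample_rate  # must match the WAV file's sample rate
        self.lang_code = lang_code      # BCP-47 tag, e.g. 'en-US'

    # speech2text(self, file_path), as defined above, would be a method here.

# Hypothetical usage:
# transcripts = Transcriber(sample_rate=16000, lang_code='en-US').speech2text('audio.wav')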
Example 2
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    path_ = pathlib.Path.cwd()
    path_ = path_ / 'teste de fluencia-2b49c4cc975c.json'
    # client = speech_v1.SpeechClient()  # create the Google Speech API client
    client = speech_v1.SpeechClient.from_service_account_json(path_)

    with io.open(speech_file, 'rb') as audio_file:  # open the audio file
        content = audio_file.read()  # read its contents
        audio = types.RecognitionAudio(content=content)  # wrap as RecognitionAudio

    config = types.RecognitionConfig(  # configuration used for recognition
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='pt-BR',
        speech_contexts=[{
            "phrases": utils.list_animals
        }]  # phrase hints for recognition
    )

    response = client.recognize(config, audio)  # run speech recognition

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.

    # for result in response.results:
    #     # The first alternative is the most likely one for this portion.
    #     print(u'Transcript: {}'.format(result.alternatives[0].transcript))

    return response
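This variant returns the raw API response instead of printing transcripts, so the caller iterates the results itself. A short hedged usage sketch (the file name is hypothetical):

# Hedged usage sketch for transcribe_file() above; 'fluencia.wav' is hypothetical.
# Each result covers a consecutive portion of the audio.
response = transcribe_file('fluencia.wav')
for result in response.results:
    # The first alternative is the most likely transcription for this portion.
    print(result.alternatives[0].transcript)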
Example 3
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1
    from google.cloud.speech_v1 import enums
    from google.cloud.speech_v1 import types
    import io
    client = speech_v1.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config]

    # [START speech_python_migration_sync_response]
    response = client.recognize(config, audio)
    # [END speech_python_migration_sync_request]
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
def return_recognized(PATH, words):
    SCOPES = ['https://www.googleapis.com/auth/cloud-platform']
    SERVICE_ACCOUNT_FILE = 'C:/Users/Janek/PycharmProjects/test/klucz.json'
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    client = speech.SpeechClient(credentials=credentials)

    file_name = os.path.join(os.path.dirname(__file__), 'resources', PATH)

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='pl-PL',
        speech_contexts=[speech.types.SpeechContext(phrases=words)])

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    transcribed = {}

    for result in response.results:
        # print('Transcript: {}, {}'.format(result.alternatives[0].transcript, result.alternatives[0].confidence))
        alternative = result.alternatives[0]
        transcribed[alternative.transcript] = alternative.confidence

    return transcribed
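A hedged usage sketch for return_recognized(); the audio file name and the phrase list are hypothetical, and the phrases are passed as speech_contexts hints to bias recognition:

# Hypothetical call: the WAV file is expected under ./resources/ next to this module.
hints = ['kot', 'pies', 'krowa']
scores = return_recognized('zwierzeta.wav', hints)
for transcript, confidence in scores.items():
    print('{} ({:.2f})'.format(transcript, confidence))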
Example 5
async def speech_to_text(e):
    opts = e.pattern_match.group(1) or ""
    args, _ = parse_arguments(opts, ['lang'])

    lang = args.get('lang', DEFAULT_LANG)
    await e.edit("**Transcribing...**")

    message = await e.get_reply_message()
    file = message.audio or message.voice

    if not file:
        await e.edit("**No audio file specified**", delete_in=3)
        return

    file = await bot.download_file(file)

    # The downloaded bytes are passed to the API directly
    audio = types.RecognitionAudio(content=file)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
        sample_rate_hertz=16000,
        language_code=lang)

    operation = STTClient.long_running_recognize(config, audio)
    op_result = operation.result()
    result = op_result.results[0].alternatives[0]

    output = f"**Transcript:** {result.transcript}\n\n**Confidence:** __{round(result.confidence, 5)}__"
    await e.edit(output)
def transcribe_with_word_time_offsets(
    speech_content: bytes,
) -> Iterable[Tuple[str, float, float]]:
    """Recognize words with time offsets from a speech.

    Args:
        speech_content: Binary data of the speech.

    Yields:
        The word with start time and end time that api recognized.

            [
                ('여기요', 0.0, 2.0),
                ('저기요', 3.6, 5.4),
                ('저', 5.4, 9.2),
                ('밖에서', 9.2, 9.6),
                ('장애인', 9.6, 10.0),
                ('주차', 10.0, 10.3),
                ('가능', 10.3, 10.5),
                ('까만색', 10.5, 11.3),
                ('소나타', 11.3, 11.7),
                ('글', 11.7, 11.8),
                ('찾아요', 11.8, 12.2),
                ('근데요', 12.2, 13.2)
            ]

    See:
        https://cloud.google.com/speech-to-text/docs/sync-recognize

    """
    client = SpeechClient()

    audio = types.RecognitionAudio(content=speech_content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="ko-KR",
        enable_word_time_offsets=True,
    )

    response = client.recognize(config, audio)

    for result in response.results:
        alternative = result.alternatives[0]

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            yield (
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9,
            )
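A hedged sketch of consuming the generator above; the WAV path is hypothetical and the audio is assumed to be 44.1 kHz LINEAR16 Korean speech, matching the config:

# Hypothetical caller: read the raw bytes and print each word with its time offsets.
with open('speech.wav', 'rb') as f:
    speech_content = f.read()

for word, start, end in transcribe_with_word_time_offsets(speech_content):
    print('{}: {:.1f}s - {:.1f}s'.format(word, start, end))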
    def test_inherited_method(self):
        from google.cloud.speech_v1 import types

        client = self._make_one()

        config = types.RecognitionConfig(encoding='FLAC')
        audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')
        with mock.patch.object(client, '_recognize') as recognize:
            client.recognize(config, audio)

            # Assert that the underlying GAPIC method was called as expected.
            recognize.assert_called_once_with(
                types.RecognizeRequest(
                    config=config,
                    audio=audio,
                ), None)
    def test_inherited_method(self):
        from google.cloud.speech_v1 import types

        client = self._make_one()

        config = types.RecognitionConfig(encoding='FLAC')
        audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')
        patch = mock.patch.object(client, '_recognize', autospec=True)
        with patch as recognize:
            client.recognize(config, audio)

            # Assert that the underlying GAPIC method was called as expected.
            assert recognize.call_count == 1
            _, args, _ = recognize.mock_calls[0]
            assert args[0] == types.RecognizeRequest(
                config=config,
                audio=audio,
            )
Example 9
def transcribe_gcs(gcs_uri):
    """Transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    # [START speech_python_migration_config_gcs]
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config_gcs]

    response = client.recognize(config, audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
Example 10
def upload():
    # Assumption: the original early return was meant for GET requests only;
    # returning the template unconditionally would make the code below unreachable.
    if request.method == 'GET':
        return render_template('post.html')

    # Read the uploaded audio bytes from the request
    uri = request.files['audio'].stream.read()

    client = speech_v1.SpeechClient()

    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    sample_rate_hertz = 48000
    language_code = 'en-US'
    config = types.RecognitionConfig(encoding=encoding,
                                     sample_rate_hertz=sample_rate_hertz,
                                     language_code=language_code)
    audio = types.RecognitionAudio(content=uri)
    response = client.recognize(config, audio)
    # response.results[0].alternatives[0].transcript holds the top transcription

    # As in the original, the contents of a local sample file are returned
    # instead of the recognition result.
    sample_txt = ""
    with io.open('sample.txt', mode='r', encoding='utf-8', errors='ignore') as x:
        for line in x:
            sample_txt += line
    print(sample_txt)
    return sample_txt
Example 11
def run_quickstart():
    client = speech_v1.SpeechClient()
    # The name of the audio file to transcribe
    file_name = '../../sound/sample.wav'
    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ko-KR',
        audio_channel_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=90)

    def closecallback():
        window.destroy()

    window = tkinter.Tk()
    window.title("AI Speaker Test")
    window.geometry("640x400+100+100")
    window.resizable(False, False)
    text = tkinter.Text(window)

    for result in response.results:
        text.insert(tkinter.CURRENT, '음성출력\n')  # header line: "voice output"
        # Check whether the transcript contains the word '메시지' ("message")
        if '메시지' in result.alternatives[0].transcript:
            text.insert(tkinter.CURRENT, result.alternatives[0].transcript)
            text.pack()
            button = tkinter.Button(window,
                                    text='Close',
                                    command=closecallback)
            button.place(x=0, y=350, relx=0.5)
            window.mainloop()
            playsound(file_name)
Example 12
def audio_file_reader(self, file_name):
    # Load the audio file into memory and wrap it as RecognitionAudio
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
    return audio
Example 13
## Code using the Google Cloud Speech API, open source: https://pypi.org/project/google-cloud-speech/

import io
import os

from google.cloud import speech_v1 as speech
from google.cloud.speech_v1 import enums
from google.cloud.speech_v1 import types

client = speech.SpeechClient()

file_name = os.path.join(os.path.dirname(__file__), 'resources', 'audio.raw')

# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US')

# Detects speech in the audio file
response = client.recognize(config, audio)

for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))
Example 14
def sample_recognize():
    #print("Record")
    # CHUNK = 1024
    # FORMAT = pyaudio.paInt16
    # CHANNELS = 2
    # RATE = 44100
    # RECORD_SECONDS = 5
    # WAVE_OUTPUT_FILENAME = "output.wav"

    # p = pyaudio.PyAudio()

    # stream = p.open(format=FORMAT,
    #                 channels=CHANNELS,
    #                 rate=RATE,
    #                 input=True,
    #                 frames_per_buffer=CHUNK)

    # print("* recording")

    # frames = []

    # # you'll probably want to experiment on threshold
    # # depends how noisy the signal
    # threshold = 10000
    # max_value = 0

    # count=0
    # #for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    # while count<100:
    #     data = stream.read(CHUNK)
    #     as_ints = array('h', data)
    #     max_value = max(as_ints)
    #     #print(max_value)
    #     if max_value > threshold:
    #         print("More Than Threshold")
    #         count=0
    #     else:
    #         print("Less Than Threshold")
    #         count+=1
    #     frames.append(data)

    # print("* done recording")

    # stream.stop_stream()
    # stream.close()
    # p.terminate()

    # wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    # wf.setnchannels(CHANNELS)
    # wf.setsampwidth(p.get_sample_size(FORMAT))
    # wf.setframerate(RATE)
    # wf.writeframes(b''.join(frames))
    # wf.close()

    client = speech_v1.SpeechClient()
    # storage_uri = 'gs://cloud-samples-data/speech/multi.wav'

    # The number of channels in the input audio file (optional)
    audio_channel_count = 1

    # When set to true, each audio channel will be recognized separately.
    # The recognition result will contain a channel_tag field to state which
    # channel that result belongs to
    enable_separate_recognition_per_channel = True

    # The language of the supplied audio
    language_code = "en-US"
    config = {
        "audio_channel_count": audio_channel_count,
        "enable_separate_recognition_per_channel":
        enable_separate_recognition_per_channel,
        "encoding": "LINEAR16",
        "language_code": language_code,
        # "sample_rate_hertz":44100
    }

    voice_file = 'tmp.wav'
    with io.open(voice_file, 'rb') as f:
        audio_file = f.read()

    audio = types.RecognitionAudio(content=audio_file)
    response = client.recognize(config, audio)
    text = ''
    for result in response.results:
        # channel_tag to recognize which audio channel this result is for
        #print(u"Channel tag: {}".format(result.channel_tag))
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        #print(u"Transcript: {}".format(alternative.transcript))
        text = text + alternative.transcript
        #print(text)
    return text