def speech2text(self, file_path):
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = file_path

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.sample_rate,
        language_code=self.lang_code)

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    result_li = []
    for result in response.results:
        result_li.append(result.alternatives[0].transcript)
    return result_li
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    path_ = pathlib.Path.cwd()
    path_ = path_ / 'teste de fluencia-2b49c4cc975c.json'

    # client = speech_v1.SpeechClient()
    # Create the Google API client from the service account key file
    client = speech_v1.SpeechClient.from_service_account_json(path_)

    # Open the audio file and read its contents
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)  # wrap the raw bytes
    config = types.RecognitionConfig(  # settings used for recognition
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='pt-BR',
        speech_contexts=[{"phrases": utils.list_animals}]  # recognition hints
    )

    response = client.recognize(config, audio)  # recognize the audio

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    # for result in response.results:
    #     # The first alternative is the most likely one for this portion.
    #     print(u'Transcript: {}'.format(result.alternatives[0].transcript))
    return response
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1
    from google.cloud.speech_v1 import enums
    from google.cloud.speech_v1 import types
    import io

    client = speech_v1.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config]

    # [START speech_python_migration_sync_response]
    response = client.recognize(config, audio)
    # [END speech_python_migration_sync_request]

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
def return_recognized(PATH, words):
    SCOPES = ['https://www.googleapis.com/auth/cloud-platform']
    SERVICE_ACCOUNT_FILE = 'C:/Users/Janek/PycharmProjects/test/klucz.json'

    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    client = speech.SpeechClient(credentials=credentials)

    file_name = os.path.join(os.path.dirname(__file__), 'resources', PATH)

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='pl-PL',
        speech_contexts=[speech.types.SpeechContext(phrases=words)])

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    transcribed = {}
    for result in response.results:
        # print('Transcript: {}, {}'.format(result.alternatives[0].transcript,
        #                                   result.alternatives[0].confidence))
        transcribed[result.alternatives[0].transcript] = result.alternatives[0].confidence
    return transcribed
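A quick sketch of how return_recognized might be called; the file name and phrase list below are invented placeholders, used only to show that the function returns a dict mapping each transcript to its confidence:

# Hypothetical usage: the file name and phrase list are placeholders.
scores = return_recognized('polecenie.wav', ['start', 'stop', 'pauza'])
for transcript, confidence in scores.items():
    print('{}: {:.3f}'.format(transcript, confidence))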
async def speech_to_text(e):
    opts = e.pattern_match.group(1) or ""
    args, _ = parse_arguments(opts, ['lang'])
    lang = args.get('lang', DEFAULT_LANG)

    await e.edit("**Transcribing...**")

    message = await e.get_reply_message()
    file = message.audio or message.voice
    if not file:
        await e.edit("**No audio file specified**", delete_in=3)
        return

    file = await bot.download_file(file)
    audio = types.RecognitionAudio(content=file)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
        sample_rate_hertz=16000,
        language_code=lang)

    response = STTClient.long_running_recognize(config, audio)
    op_result = response.result()

    result = op_result.results[0].alternatives[0]
    output = (f"**Transcript:** {result.transcript}\n\n"
              f"**Confidence:** __{round(result.confidence, 5)}__")
    await e.edit(output)
def transcribe_with_word_time_offsets(
    speech_content: bytes,
) -> Iterable[Tuple[str, float, float]]:
    """Recognize words with time offsets from a speech.

    Args:
        speech_content: Binary data of the speech.

    Yields:
        The word with start time and end time that the API recognized.

        [
            ('여기요', 0.0, 2.0),
            ('저기요', 3.6, 5.4),
            ('저', 5.4, 9.2),
            ('밖에서', 9.2, 9.6),
            ('장애인', 9.6, 10.0),
            ('주차', 10.0, 10.3),
            ('가능', 10.3, 10.5),
            ('까만색', 10.5, 11.3),
            ('소나타', 11.3, 11.7),
            ('글', 11.7, 11.8),
            ('찾아요', 11.8, 12.2),
            ('근데요', 12.2, 13.2)
        ]

    See: https://cloud.google.com/speech-to-text/docs/sync-recognize
    """
    client = SpeechClient()
    audio = types.RecognitionAudio(content=speech_content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="ko-KR",
        enable_word_time_offsets=True,
    )
    response = client.recognize(config, audio)

    for result in response.results:
        alternative = result.alternatives[0]
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            yield (
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9,
            )
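A minimal sketch of consuming the generator above, assuming the bytes come from a local 44.1 kHz LINEAR16 WAV file matching the config; the file name is a placeholder:

with open('sample_ko.wav', 'rb') as f:  # placeholder file name
    speech_content = f.read()

for word, start, end in transcribe_with_word_time_offsets(speech_content):
    print(f'{word}: {start:.2f}s - {end:.2f}s')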
def test_inherited_method(self):
    from google.cloud.speech_v1 import types

    client = self._make_one()
    config = types.RecognitionConfig(encoding='FLAC')
    audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')
    with mock.patch.object(client, '_recognize') as recognize:
        client.recognize(config, audio)

        # Assert that the underlying GAPIC method was called as expected.
        recognize.assert_called_once_with(
            types.RecognizeRequest(
                config=config,
                audio=audio,
            ), None)
def test_inherited_method(self):
    from google.cloud.speech_v1 import types

    client = self._make_one()
    config = types.RecognitionConfig(encoding='FLAC')
    audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')

    patch = mock.patch.object(client, '_recognize', autospec=True)
    with patch as recognize:
        client.recognize(config, audio)

    # Assert that the underlying GAPIC method was called as expected.
    assert recognize.call_count == 1
    _, args, _ = recognize.mock_calls[0]
    assert args[0] == types.RecognizeRequest(
        config=config,
        audio=audio,
    )
def transcribe_gcs(gcs_uri):
    """Transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    # [START speech_python_migration_config_gcs]
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config_gcs]

    response = client.recognize(config, audio)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
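A hypothetical invocation of transcribe_gcs; the bucket and object names are placeholders, and the object must be 16 kHz FLAC to match the config above:

transcribe_gcs('gs://my-bucket/speech/sample.flac')  # placeholder URI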
def upload():
    # return render_template('post.html')
    uri = request.files['audio'].stream.read()
    # uri = open(stream, "rb", buffering=0)

    # print("Debug")
    # print(request)
    # print(request.form)
    # print(request.files)
    # print("Debug")

    # text = convert_speech_to_text(audio)

    client = speech_v1.SpeechClient()

    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    sample_rate_hertz = 48000
    language_code = 'en-US'
    # config = {'encoding': encoding, 'sample_rate_hertz': sample_rate_hertz,
    #           'language_code': language_code}
    config = types.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code)

    # uri = 'gs://bucket_name/file_name.flac'
    # print(str(uri))
    audio = types.RecognitionAudio(content=uri)
    # print("AUDIO: " + str(audio))
    # print("CONFIG: " + str(config))

    response = client.recognize(config, audio)
    # print("AAAAAA" + str(response))
    # print("BBBBBBB" + str(response.results))
    # print("CCCCCCC" + str(response.results[0].alternatives))
    # print(response.results[0].alternatives[0].transcript)
    #
    # return response.results[0].alternatives[0].transcript

    sample_txt = ""
    with io.open('sample.txt', mode='r', encoding='utf-8', errors='ignore') as x:
        for line in x:
            sample_txt += line
    print(sample_txt)
    return sample_txt
def run_quickstart():
    client = speech_v1.SpeechClient()

    # The name of the audio file to transcribe
    file_name = '../../sound/sample.wav'

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ko-KR',
        audio_channel_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=90)

    def closecallback():
        window.destroy()

    window = tkinter.Tk()
    window.title("AI Speaker Test")
    window.geometry("640x400+100+100")
    window.resizable(False, False)

    text = tkinter.Text(window)
    for result in response.results:
        text.insert(tkinter.CURRENT, '음성출력\n')  # "voice output"
        if '메시지' in result.alternatives[0].transcript:  # "message"
            text.insert(tkinter.CURRENT, result.alternatives[0].transcript)
    text.pack()

    button = tkinter.Button(window, text='Close', command=closecallback)
    button.place(x=0, y=350, relx=0.5)

    window.mainloop()
    playsound(file_name)
def audio_file_reader(self, file_name):
    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
    return audio
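A sketch of how the helper above might pair with a config and a synchronous recognize() call. The owning class, client, file name, and recognition settings are assumptions, not part of the original:

reader = AudioReader()  # hypothetical class owning audio_file_reader
client = speech.SpeechClient()
audio = reader.audio_file_reader('clip.wav')  # placeholder file name
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US')
response = client.recognize(config, audio)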
## Code using the Google Cloud API, open source:
## https://pypi.org/project/google-cloud-speech/
import io
import os

from google.cloud import speech_v1 as speech
from google.cloud.speech_v1 import enums
from google.cloud.speech_v1 import types

client = speech.SpeechClient()

file_name = os.path.join(os.path.dirname(__file__), 'resources', 'audio.raw')

# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US')

# Detects speech in the audio file
response = client.recognize(config, audio)

for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))
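Synchronous recognize() handles only about a minute of audio. For longer recordings, a hedged sketch of the asynchronous variant with the same config; the GCS URI is a placeholder, and the file must live in Google Cloud Storage:

# Async variant for long audio; the URI below is a placeholder.
audio = types.RecognitionAudio(uri='gs://my-bucket/long_audio.raw')
operation = client.long_running_recognize(config, audio)
response = operation.result(timeout=300)
for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))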
def sample_recognize():
    # print("Record")
    # CHUNK = 1024
    # FORMAT = pyaudio.paInt16
    # CHANNELS = 2
    # RATE = 44100
    # RECORD_SECONDS = 5
    # WAVE_OUTPUT_FILENAME = "output.wav"

    # p = pyaudio.PyAudio()
    # stream = p.open(format=FORMAT,
    #                 channels=CHANNELS,
    #                 rate=RATE,
    #                 input=True,
    #                 frames_per_buffer=CHUNK)
    # print("* recording")
    # frames = []

    # # You'll probably want to experiment with the threshold;
    # # it depends on how noisy the signal is.
    # threshold = 10000
    # max_value = 0
    # count = 0
    # # for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    # while count < 100:
    #     data = stream.read(CHUNK)
    #     as_ints = array('h', data)
    #     max_value = max(as_ints)
    #     # print(max_value)
    #     if max_value > threshold:
    #         print("More Than Threshold")
    #         count = 0
    #     else:
    #         print("Less Than Threshold")
    #         count += 1
    #     frames.append(data)
    # print("* done recording")

    # stream.stop_stream()
    # stream.close()
    # p.terminate()

    # wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    # wf.setnchannels(CHANNELS)
    # wf.setsampwidth(p.get_sample_size(FORMAT))
    # wf.setframerate(RATE)
    # wf.writeframes(b''.join(frames))
    # wf.close()

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/multi.wav'

    # The number of channels in the input audio file (optional)
    audio_channel_count = 1

    # When set to true, each audio channel will be recognized separately.
    # The recognition result will contain a channel_tag field to state which
    # channel that result belongs to.
    enable_separate_recognition_per_channel = True

    # The language of the supplied audio
    language_code = "en-US"

    config = {
        "audio_channel_count": audio_channel_count,
        "enable_separate_recognition_per_channel":
            enable_separate_recognition_per_channel,
        "encoding": "LINEAR16",
        "language_code": language_code,
        # "sample_rate_hertz": 44100,
    }

    voice_file = 'tmp.wav'
    with io.open(voice_file, 'rb') as f:
        audio_file = f.read()
    audio = types.RecognitionAudio(content=audio_file)

    response = client.recognize(config, audio)

    text = ''
    for result in response.results:
        # channel_tag identifies which audio channel this result is for
        # print(u"Channel tag: {}".format(result.channel_tag))
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        # print(u"Transcript: {}".format(alternative.transcript))
        text = text + alternative.transcript
    # print(text)
    return text