def get_client(lang='en-US', sample_rate=16000, interim_results=False,
               single_utterance=True, phrase_key=""):
    """ Helper to return client and config """
    client = SpeechClient()
    config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code=lang,
            # Enhanced models are only available to projects that
            # opt in for audio data collection.
            use_enhanced=True,
            # A model must be specified to use an enhanced model.
            model="command_and_search",
            speech_contexts=[
                types.SpeechContext(
                    phrases=PhraseGenerator.get_phrases("app/config.json", phrase_key),
                )
            ]),
        interim_results=interim_results,
        single_utterance=single_utterance)
    print(str(config))
    return client, config
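# --- Usage sketch (not from the original source): how get_client() might be
# consumed with the pre-2.0 streaming API. audio_chunks() is a hypothetical
# generator yielding raw LINEAR16 audio bytes.
client, config = get_client(lang='en-US', phrase_key='search_terms')
requests = (types.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in audio_chunks())
for response in client.streaming_recognize(config, requests):
    for result in response.results:
        if result.is_final:
            print(result.alternatives[0].transcript)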
def configureAPI(self):
    if self.title.find('.flac') != -1:
        with open(os.path.join(self.path, self.title), 'rb') as audio_file:
            content = audio_file.read()
        self.audio = types.RecognitionAudio(content=content)
    else:
        self.audio = types.RecognitionAudio(
            uri="gs://twiml-mp3/" + self.title + ".flac")
    self.config = types.RecognitionConfig(
        encoding=self.encoding,
        sample_rate_hertz=self.sample_rate,
        language_code=self.language_code,
        enable_automatic_punctuation=self.punctuation,
        enable_speaker_diarization=self.diarization,
        diarization_speaker_count=self.num_speakers,
        audio_channel_count=1,
        use_enhanced=self.enhanced,
        model=self.model,
        enable_word_time_offsets=self.time_offsets,
        enable_word_confidence=self.word_confidence,
        max_alternatives=self.max_alternatives,
        metadata=self.metadata,
        speech_contexts=[types.SpeechContext(phrases=self.phrases)])
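# --- Usage sketch (assumption, not in the original class): configureAPI()
# only prepares self.audio and self.config, so a follow-up method along these
# lines would send them off. Assumes self.client is a speech.SpeechClient
# created elsewhere in the class.
def transcribe(self):
    self.configureAPI()
    # Long-running recognition suits audio longer than ~1 minute.
    operation = self.client.long_running_recognize(self.config, self.audio)
    response = operation.result(timeout=3600)
    return "\n".join(result.alternatives[0].transcript
                     for result in response.results)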
def process(self, loop):
    """ Audio stream recognition and result parsing """
    # You can add speech contexts for better recognition
    cap_speech_context = types.SpeechContext(**self.context)
    metadata = types.RecognitionMetadata(**self.metadata)
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=self.encoding,
        sample_rate_hertz=self.rate,
        language_code=self.language,
        speech_contexts=[cap_speech_context],
        enable_automatic_punctuation=True,
        model=self.model,
        metadata=metadata)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=self.interim_results,
        single_utterance=self.single_utterance)
    audio_generator = self.stream_generator()
    requests = iter(
        types.StreamingRecognizeRequest(audio_content=content)
        for content in audio_generator)
    responses = client.streaming_recognize(streaming_config, requests)
    try:
        for response in responses:
            if self.terminated:
                break
            if not response.results:
                continue
            result = response.results[0]
            if not result.alternatives:
                continue
            speechData = MessageToDict(response)
            global_async_worker.add_task(self.async_callback(speechData))
            # debug
            transcript = result.alternatives[0].transcript
            print('>>', transcript, "(OK)" if result.is_final else "")
    except Exception as e:
        print('process excepted', e)
        self.start()
def gspeech_client(self):
    """Creates the Google Speech API client, configures it,
    and sends/gets audio/text data for parsing.
    """
    language_code = 'en-US'
    # Hints for the API
    context = types.SpeechContext(phrases=self.context)
    client = speech.SpeechClient()
    # Create metadata object, helps processing
    metadata = types.RecognitionMetadata()
    # Interaction Type:
    # VOICE_SEARCH: Transcribe spoken questions and queries into text.
    # VOICE_COMMAND: Transcribe voice commands, such as for controlling a device.
    metadata.interaction_type = (
        enums.RecognitionMetadata.InteractionType.VOICE_COMMAND)
    # Microphone Distance:
    # NEARFIELD: The audio was captured from a closely placed microphone.
    # MIDFIELD: The speaker is within 3 meters of the microphone.
    # FARFIELD: The speaker is more than 3 meters away from the microphone.
    metadata.microphone_distance = (
        enums.RecognitionMetadata.MicrophoneDistance.MIDFIELD)
    # Device Type:
    # PC: Speech was recorded using a personal computer or tablet.
    # VEHICLE: Speech was recorded in a vehicle.
    # OTHER_OUTDOOR_DEVICE: Speech was recorded outdoors.
    # OTHER_INDOOR_DEVICE: Speech was recorded indoors.
    metadata.recording_device_type = (
        enums.RecognitionMetadata.RecordingDeviceType.PC)
    # Media Type:
    # AUDIO: The speech data is an audio recording.
    # VIDEO: The speech data was originally recorded on a video.
    metadata.original_media_type = (
        enums.RecognitionMetadata.OriginalMediaType.AUDIO)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=language_code,
        speech_contexts=[context],
        use_enhanced=True,
        model='command_and_search',
        metadata=metadata)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        single_utterance=False,
        interim_results=False)
    # Request-generator pattern from the Google Speech Python docs
    requests = (types.StreamingRecognizeRequest(audio_content=content)
                for content in self.generator())
    responses = client.streaming_recognize(streaming_config, requests)
    self._listen_print_loop(responses)
def listen(self, language_code='ja-JP'):
    """Listen."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.rate,
        model=None,
        speech_contexts=[types.SpeechContext()],
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        single_utterance=True,
        interim_results=True)
    self.callbacks.get("ready", lambda: True)()
    with MicrophoneStream(self.rate, int(self.rate / 10)) as stream:
        self.callbacks.get("start", lambda: True)()
        while True:
            try:
                audio_generator = stream.generator()
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                responses = client.streaming_recognize(streaming_config, requests)
                self.listen_print_loop(responses)
            except exceptions.OutOfRange:
                print("Time exceeded.(OutOfRange)")
            except exceptions.ServiceUnavailable:
                print("Connection closed.(ServiceUnavailable)")
            except KeyboardInterrupt:
                print("KeyboardInterrupt.")
                break
            except:
                print("Unexpected error:", sys.exc_info()[0])
                raise
        self.callbacks.get("end", lambda: True)()
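# --- Usage sketch (hypothetical; the class name and constructor arguments are
# placeholders): wiring the "ready"/"start"/"end" callbacks listen() expects.
recognizer = SpeechRecognizer(rate=16000)
recognizer.callbacks = {
    "ready": lambda: print("Opening microphone..."),
    "start": lambda: print("Listening (single utterance)..."),
    "end": lambda: print("Session ended."),
}
recognizer.listen(language_code='ja-JP')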
def translate_with_timestamps(self, gs_uri, encoding, mode, hint):
    audio = types.RecognitionAudio(uri=gs_uri)
    config = types.RecognitionConfig(
        encoding=encoding,
        language_code=mode,
        enable_word_time_offsets=True,
        speech_contexts=[types.SpeechContext(phrases=hint)],
        enable_word_confidence=True)
    operation = self.client.long_running_recognize(config=config, audio=audio)
    results = []
    for result in operation.result().results:
        alternatives = result.alternatives
        if not alternatives:
            continue
        alternative = alternatives[0]
        results.append([alternative.transcript, alternative.confidence])
        for word_info in alternative.words:
            word = word_info.word
            start_time = (word_info.start_time.seconds
                          + word_info.start_time.nanos * 1e-9)
            end_time = (word_info.end_time.seconds
                        + word_info.end_time.nanos * 1e-9)
            confidence = word_info.confidence
            results.append([word, start_time, end_time, confidence])
    return results
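# --- Usage sketch (hypothetical instance and URI): the returned list mixes
# two row shapes, [transcript, confidence] for each alternative followed by
# [word, start_sec, end_sec, confidence] rows for its words.
rows = transcriber.translate_with_timestamps(
    "gs://example-bucket/interview.flac",
    enums.RecognitionConfig.AudioEncoding.FLAC,
    "en-US",
    ["Cloud Speech", "diarization"])
for row in rows:
    if len(row) == 2:   # [transcript, confidence]
        print("TRANSCRIPT (%.2f): %s" % (row[1], row[0]))
    else:               # [word, start_sec, end_sec, confidence]
        print("  %-15s %6.2f-%6.2f  %.2f" % tuple(row))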
def encode_audio(audio_content):
    # The original snippet begins mid-function with this return statement;
    # the enclosing def (name is a guess) is reconstructed here for context.
    return base64.b64encode(audio_content)


#### Specify config

## Specify metadata
metadata = speech.types.RecognitionMetadata()
metadata.interaction_type = speech.enums.RecognitionMetadata.InteractionType.VOICE_SEARCH
metadata.industry_naics_code_of_audio = 531210
metadata.microphone_distance = speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD
metadata.original_media_type = speech.enums.RecognitionMetadata.OriginalMediaType.AUDIO
# Note: change this if doing smartphone audio recording
metadata.recording_device_type = speech.enums.RecognitionMetadata.RecordingDeviceType.PC
metadata.original_mime_type = "audio/flac"
metadata.audio_topic = "Voice search for real estate properties"

## Phrase hints
speech_context = types.SpeechContext(phrases=SPEECH_CONTEXT_PHRASES)

## Final config
config = types.RecognitionConfig(
    encoding="FLAC",
    language_code='en-AU',
    max_alternatives=1,
    model="command_and_search",
    enable_word_confidence=True,
    metadata=metadata,
    speech_contexts=[speech_context],
    enable_automatic_punctuation=True)

### Send off audio and config to get transcribed
# `client` (a speech.SpeechClient) and `audio` are assumed to be defined
# earlier in the original script.
response = client.recognize(config, audio)
for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))
    sen = result.alternatives[0].transcript