Example No. 1
# Assumes the pre-2.0 google-cloud-speech client, where `types` and `enums`
# are top-level modules; PhraseGenerator is a project-specific helper.
from google.cloud.speech import SpeechClient, enums, types


def get_client(lang='en-US',
               sample_rate=16000,
               interim_results=False,
               single_utterance=True,
               phrase_key=""):
    """
    Helper to return a configured client and streaming config.
    """
    client = SpeechClient()
    config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code=lang,
            # Enhanced models are only available to projects that
            # opt in for audio data collection.
            use_enhanced=True,
            # A model must be specified to use enhanced model.
            model="command_and_search",
            speech_contexts=[
                types.SpeechContext(phrases=PhraseGenerator.get_phrases(
                    "app/config.json", phrase_key), )
            ]),
        interim_results=interim_results,
        single_utterance=single_utterance)
    print(str(config))
    return client, config
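
For context, here is a minimal usage sketch of the helper above. The `audio_chunks` iterable of raw 16 kHz LINEAR16 byte buffers and the 'commands' phrase key are hypothetical, not part of the original:

client, config = get_client(lang='en-US', phrase_key='commands')

# audio_chunks is a hypothetical iterable of raw LINEAR16 byte buffers.
requests = (types.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in audio_chunks)

# In the pre-2.0 client, streaming_recognize takes the streaming config
# followed by the request iterator.
for response in client.streaming_recognize(config, requests):
    for result in response.results:
        if result.is_final:
            print(result.alternatives[0].transcript)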
Example No. 2
    def configureAPI(self):
        """Build the RecognitionAudio and RecognitionConfig for this file."""
        # Local .flac files are read from disk; any other title is assumed
        # to live in the gs://twiml-mp3 bucket with ".flac" appended.
        if '.flac' in self.title:
            with open(os.path.join(self.path, self.title), 'rb') as audio_file:
                content = audio_file.read()
            self.audio = types.RecognitionAudio(content=content)
        else:
            self.audio = types.RecognitionAudio(uri="gs://twiml-mp3/" +
                                                self.title + ".flac")

        self.config = types.RecognitionConfig(
            encoding=self.encoding,
            sample_rate_hertz=self.sample_rate,
            language_code=self.language_code,
            enable_automatic_punctuation=self.punctuation,
            enable_speaker_diarization=self.diarization,
            diarization_speaker_count=self.num_speakers,
            audio_channel_count=1,
            use_enhanced=self.enhanced,
            model=self.model,
            enable_word_time_offsets=self.time_offsets,
            enable_word_confidence=self.word_confidence,
            max_alternatives=self.max_alternatives,
            metadata=self.metadata,
            speech_contexts=[types.SpeechContext(phrases=self.phrases)])
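
For completeness, a hedged sketch of how these objects might then be used; the `transcribe` method name and the `self.client` attribute are assumptions, not part of the original snippet:

    def transcribe(self):
        # Hypothetical follow-up: diarization and word time offsets on
        # longer files generally require the asynchronous long-running API.
        operation = self.client.long_running_recognize(config=self.config,
                                                       audio=self.audio)
        response = operation.result(timeout=600)
        return [result.alternatives[0].transcript
                for result in response.results]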
Example No. 3
    def process(self, loop):
        """
        Audio stream recognition and result parsing.
        """
        # Speech contexts (phrase hints) can improve recognition accuracy.
        cap_speech_context = types.SpeechContext(**self.context)
        metadata = types.RecognitionMetadata(**self.metadata)
        client = speech.SpeechClient()
        config = types.RecognitionConfig(encoding=self.encoding,
                                         sample_rate_hertz=self.rate,
                                         language_code=self.language,
                                         speech_contexts=[
                                             cap_speech_context,
                                         ],
                                         enable_automatic_punctuation=True,
                                         model=self.model,
                                         metadata=metadata)

        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=self.interim_results,
            single_utterance=self.single_utterance)
        audio_generator = self.stream_generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        try:
            for response in responses:
                if self.terminated:
                    break
                if not response.results:
                    continue
                result = response.results[0]
                if not result.alternatives:
                    continue
                # MessageToDict (google.protobuf.json_format) converts the
                # protobuf response into a plain dict for the callback.
                speech_data = MessageToDict(response)
                global_async_worker.add_task(self.async_callback(speech_data))

                # Debug output: final results are marked with "(OK)".
                transcript = result.alternatives[0].transcript
                print('>>', transcript, "(OK)" if result.is_final else "")
        except Exception as e:
            # On stream errors (e.g. timeouts), log and restart the stream.
            print('process excepted', e)
            self.start()
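
The `stream_generator` method used above is not shown in the snippet; one common shape is a queue-fed generator. A minimal sketch, assuming chunks are pushed onto a `self.buff` queue by an audio callback with `None` as an end-of-stream sentinel (all names here are assumptions):

    def stream_generator(self):
        # Hypothetical sketch: yield raw audio chunks until terminated.
        # Assumes self.buff is a queue.Queue fed by an audio callback,
        # with None pushed as an end-of-stream sentinel.
        while not self.terminated:
            chunk = self.buff.get()
            if chunk is None:
                return
            yield chunk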
Example No. 4
    def gspeech_client(self):
        """Creates the Google Speech API client, configures it, and
        sends/receives audio and text data for parsing.
        """
        language_code = 'en-US'
        # Phrase hints for the API
        context = types.SpeechContext(phrases=self.context)
        client = speech.SpeechClient()
        # Recognition metadata helps the API pick better models.
        metadata = types.RecognitionMetadata()
        # Interaction type:
        # VOICE_SEARCH: transcribe spoken questions and queries into text.
        # VOICE_COMMAND: transcribe voice commands, such as for controlling a device.
        metadata.interaction_type = (
            enums.RecognitionMetadata.InteractionType.VOICE_COMMAND)
        # Microphone distance:
        # NEARFIELD: the audio was captured from a closely placed microphone.
        # MIDFIELD: the speaker is within 3 meters of the microphone.
        # FARFIELD: the speaker is more than 3 meters from the microphone.
        metadata.microphone_distance = (
            enums.RecognitionMetadata.MicrophoneDistance.MIDFIELD)
        # Recording device type:
        # PC: speech was recorded on a personal computer or tablet.
        # VEHICLE: speech was recorded in a vehicle.
        # OTHER_OUTDOOR_DEVICE: speech was recorded outdoors.
        # OTHER_INDOOR_DEVICE: speech was recorded indoors.
        metadata.recording_device_type = (
            enums.RecognitionMetadata.RecordingDeviceType.PC)
        # Original media type:
        # AUDIO: the speech data is an audio recording.
        # VIDEO: the speech data was originally recorded on video.
        metadata.original_media_type = (
            enums.RecognitionMetadata.OriginalMediaType.AUDIO)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code=language_code,
            speech_contexts=[context],
            use_enhanced=True,
            model='command_and_search',
            metadata=metadata)
        streaming_config = types.StreamingRecognitionConfig(
            config=config, single_utterance=False, interim_results=False)
        # Generator expression of streaming requests, per the Google Speech
        # Python docs.
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in self.generator())
        responses = client.streaming_recognize(streaming_config, requests)
        self._listen_print_loop(responses)
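
`_listen_print_loop` is not included in the snippet either; a plausible minimal version (the body here is an assumption) would print only final transcripts:

    def _listen_print_loop(self, responses):
        # Hypothetical sketch of the response loop invoked above.
        for response in responses:
            if not response.results:
                continue
            result = response.results[0]
            if result.is_final and result.alternatives:
                print(result.alternatives[0].transcript)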
Example No. 5
    def listen(self, language_code='ja-JP'):
        """Listen."""
        # See http://g.co/cloud/speech/docs/languages
        # for a list of supported languages.

        client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.rate,
            model=None,
            speech_contexts=[types.SpeechContext()],
            language_code=language_code)
        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            single_utterance=True,
            interim_results=True
        )

        self.callbacks.get("ready", lambda: True)()

        with MicrophoneStream(self.rate, self.rate // 10) as stream:

            self.callbacks.get("start", lambda: True)()

            while True:
                try:
                    audio_generator = stream.generator()
                    requests = (types.StreamingRecognizeRequest(audio_content=content)
                                for content in audio_generator)
                    responses = client.streaming_recognize(streaming_config, requests)

                    self.listen_print_loop(responses)

                except exceptions.OutOfRange:
                    print("Time exceeded. (OutOfRange)")
                except exceptions.ServiceUnavailable:
                    print("Connection closed. (ServiceUnavailable)")
                except KeyboardInterrupt:
                    print("KeyboardInterrupt.")
                    break
                except Exception:
                    print("Unexpected error:", sys.exc_info()[0])
                    raise

            self.callbacks.get("end", lambda: True)()
Example No. 6
    def translate_with_timestamps(self, gs_uri, encoding, mode, hint):
        """Transcribe a GCS audio file, returning per-word timestamps."""
        audio = types.RecognitionAudio(uri=gs_uri)
        config = types.RecognitionConfig(
            encoding=encoding,
            language_code=mode,
            enable_word_time_offsets=True,
            speech_contexts=[types.SpeechContext(phrases=hint)],
            enable_word_confidence=True)
        operation = self.client.long_running_recognize(config=config,
                                                       audio=audio)
        results = []
        for result in operation.result().results:
            alternatives = result.alternatives
            if not alternatives:
                continue
            alternative = alternatives[0]
            results.append([alternative.transcript, alternative.confidence])
            for word_info in alternative.words:
                word = word_info.word
                # start_time/end_time are protobuf Durations; convert to seconds.
                start_time = word_info.start_time.seconds + word_info.start_time.nanos * 1e-9
                end_time = word_info.end_time.seconds + word_info.end_time.nanos * 1e-9
                confidence = word_info.confidence
                results.append([word, start_time, end_time, confidence])
        return results
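
A hedged usage sketch of the method above; the instance name, bucket URI, and phrase hints below are placeholders:

rows = recognizer.translate_with_timestamps(
    gs_uri="gs://my-bucket/interview.flac",
    encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
    mode="en-US",
    hint=["Kubernetes", "TensorFlow"])

for row in rows:
    # Two-element rows are [transcript, confidence]; four-element rows
    # are [word, start_time, end_time, confidence].
    print(row)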
Example No. 7
# Assumed imports for this snippet (pre-2.0 google-cloud-speech client).
import base64

from google.cloud import speech


# Tail of a base64-encoding helper; the original def line was cut off in
# the snippet, so the name below is hypothetical.
def encode_audio(audio_content):
    return base64.b64encode(audio_content)


#### Specify config
## Specify metadata
metadata = speech.types.RecognitionMetadata()
metadata.interaction_type = speech.enums.RecognitionMetadata.InteractionType.VOICE_SEARCH
metadata.industry_naics_code_of_audio = 531210
metadata.microphone_distance = speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD
metadata.original_media_type = speech.enums.RecognitionMetadata.OriginalMediaType.AUDIO
# Note: change this if doing smartphone audio recording
metadata.recording_device_type = speech.enums.RecognitionMetadata.RecordingDeviceType.PC
metadata.original_mime_type = "audio/flac"
metadata.audio_topic = "Voice search for real estate properties"
## Phrase hints
speech_context = speech.types.SpeechContext(phrases=SPEECH_CONTEXT_PHRASES)
## Final config
config = speech.types.RecognitionConfig(
    encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
    language_code='en-AU',
    max_alternatives=1,
    model="command_and_search",
    enable_word_confidence=True,
    metadata=metadata,
    speech_contexts=[speech_context],
    enable_automatic_punctuation=True)

### Send off audio and config to get transcribed
# `client` (a speech.SpeechClient) and `audio` are built earlier in the
# original script and are elided from this snippet.
response = client.recognize(config, audio)
for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))
# Keep the last result's top transcript for further processing.
sen = result.alternatives[0].transcript
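
Note that the Python client wants raw bytes in `RecognitionAudio`; the base64 encoding from the helper at the top is only needed when posting JSON to the REST endpoint. A minimal sketch of constructing the elided `audio` argument, with the file path assumed:

# Hypothetical construction of the `audio` argument used above.
with open('property_query.flac', 'rb') as f:
    audio = speech.types.RecognitionAudio(content=f.read())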