Example #1
0
    def sync_recognize(self, sample, language_code=None, max_alternatives=None,
                       profanity_filter=None, speech_context=None):
        """Synchronous Speech Recognition.

        .. _sync_recognize: https://cloud.google.com/speech/reference/\
                            rest/v1beta1/speech/syncrecognize

        See `sync_recognize`_.

        :type sample: :class:`~google.cloud.speech.sample.Sample`
        :param sample: Instance of ``Sample`` containing audio information.

        :type language_code: str
        :param language_code: (Optional) BCP-47 language tag describing the
                              supplied audio, e.g. ``'en-GB'``. Defaults to
                              ``'en-US'`` when omitted.

        :type max_alternatives: int
        :param max_alternatives: (Optional) Maximum number of recognition
                                 hypotheses to return; the server may return
                                 fewer. Valid values are 0-30, and both 0 and
                                 1 yield at most one hypothesis. Defaults
                                 to 1.

        :type profanity_filter: bool
        :param profanity_filter: If True, the server will attempt to filter
                                 out profanities, replacing all but the
                                 initial character in each filtered word with
                                 asterisks, e.g. ``'f***'``. If False or
                                 omitted, profanities won't be filtered out.

        :type speech_context: list
        :param speech_context: A list of strings (max 50) containing words and
                               phrases "hints" so that the speech recognition
                               is more likely to recognize them. This can be
                               used to improve the accuracy for specific words
                               and phrases. This can also be used to add new
                               words to the vocabulary of the recognizer.

        :rtype: list
        :returns: List of :class:`google.cloud.speech.result.Result` objects.

        :raises: ValueError if there are no results.
        """
        recognition_config = RecognitionConfig(
            encoding=sample.encoding,
            sample_rate=sample.sample_rate,
            language_code=language_code,
            max_alternatives=max_alternatives,
            profanity_filter=profanity_filter,
            speech_context=SpeechContext(phrases=speech_context))
        recognition_audio = RecognitionAudio(
            content=sample.content, uri=sample.source_uri)
        response = self._gapic_api.sync_recognize(
            config=recognition_config, audio=recognition_audio)

        # An empty result set is treated as an error rather than returned.
        if not response.results:
            raise ValueError('No results returned from the Speech API.')

        # Wrap each protobuf result in the client-level ``Result`` type.
        return [Result.from_pb(pb_result) for pb_result in response.results]
Example #2
0
    def async_recognize(self,
                        sample,
                        language_code=None,
                        max_alternatives=None,
                        profanity_filter=None,
                        speech_context=None):
        """Asynchronous Recognize request to Google Speech API.

        .. _async_recognize: https://cloud.google.com/speech/reference/\
                             rest/v1beta1/speech/asyncrecognize

        See `async_recognize`_.

        :type sample: :class:`~google.cloud.speech.sample.Sample`
        :param sample: Instance of ``Sample`` containing audio information.

        :type language_code: str
        :param language_code: (Optional) The language of the supplied audio as
                              BCP-47 language tag. Example: ``'en-GB'``.
                              If omitted, defaults to ``'en-US'``.

        :type max_alternatives: int
        :param max_alternatives: (Optional) Maximum number of recognition
                                 hypotheses to be returned. The server may
                                 return fewer than maxAlternatives.
                                 Valid values are 0-30. A value of 0 or 1
                                 will return a maximum of 1. Defaults to 1

        :type profanity_filter: bool
        :param profanity_filter: (Optional) If True, the server will attempt
                                 to filter out profanities, replacing all but
                                 the initial character in each filtered word
                                 with asterisks, e.g. ``'f***'``. If False or
                                 omitted, profanities won't be filtered out.

        :type speech_context: list
        :param speech_context: (Optional) A list of strings (max 50)
                               containing words and phrases "hints" so that
                               the speech recognition is more likely to
                               recognize them. This can be used to improve the
                               accuracy for specific words and phrases. This
                               can also be used to add new words to the
                               vocabulary of the recognizer.

        :rtype: :class:`~google.cloud.speech.operation.Operation`
        :returns: Instance of ``Operation`` to poll for results.
        """
        config = RecognitionConfig(
            encoding=sample.encoding,
            sample_rate=sample.sample_rate,
            language_code=language_code,
            max_alternatives=max_alternatives,
            profanity_filter=profanity_filter,
            speech_context=SpeechContext(phrases=speech_context))

        audio = RecognitionAudio(content=sample.content, uri=sample.source_uri)
        api = self._gapic_api
        operation_future = api.async_recognize(config=config, audio=audio)

        # The GAPIC layer returns a future; wrap its underlying long-running
        # operation protobuf in the client-level ``Operation`` for polling.
        return Operation.from_pb(operation_future.last_operation_data(), self)
Example #3
0
    def test_ctor(self):
        """Check that the factory builds a config-only streaming request."""
        from google.cloud import speech
        from google.cloud.speech.sample import Sample
        from google.cloud.proto.speech.v1beta1.cloud_speech_pb2 import (
            RecognitionConfig, SpeechContext, StreamingRecognitionConfig,
            StreamingRecognizeRequest)

        sample = Sample(content=self.AUDIO_CONTENT,
                        encoding=speech.Encoding.FLAC,
                        sample_rate=self.SAMPLE_RATE)
        language = 'US-en'
        alternatives = 2
        filter_profanity = True
        hints = SpeechContext(phrases=self.HINTS)
        single_utterance = True
        interim_results = False

        request = self._call_fut(sample, language, alternatives,
                                 filter_profanity, hints, single_utterance,
                                 interim_results)
        self.assertIsInstance(request, StreamingRecognizeRequest)

        # This isn't set by _make_streaming_request().
        # The first request can only have `streaming_config` set.
        # The following requests can only have `audio_content` set.
        self.assertEqual(request.audio_content, b'')

        self.assertIsInstance(request.streaming_config,
                              StreamingRecognitionConfig)
        streaming_config = request.streaming_config
        self.assertTrue(streaming_config.single_utterance)
        self.assertFalse(streaming_config.interim_results)

        config = streaming_config.config
        self.assertIsInstance(config, RecognitionConfig)
        # speech.Encoding.FLAC maps to enum value 2.
        self.assertEqual(config.encoding, 2)
        self.assertEqual(config.sample_rate, self.SAMPLE_RATE)
        self.assertEqual(config.language_code, language)
        self.assertEqual(config.max_alternatives, alternatives)
        self.assertTrue(config.profanity_filter)
        self.assertEqual(config.speech_context.phrases, self.HINTS)
Example #4
0
def _stream_requests(sample,
                     language_code=None,
                     max_alternatives=None,
                     profanity_filter=None,
                     speech_context=None,
                     single_utterance=None,
                     interim_results=None):
    """Generate stream of requests from sample.

    :type sample: :class:`~google.cloud.speech.sample.Sample`
    :param sample: Instance of ``Sample`` containing audio information.

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          BCP-47 language tag. Example: ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned. The server may
                             return fewer than maxAlternatives.
                             Valid values are 0-30. A value of 0 or 1
                             will return a maximum of 1. Defaults to 1

    :type profanity_filter: bool
    :param profanity_filter: (Optional) If True, the server will attempt to
                             filter out profanities, replacing all but the
                             initial character in each filtered word with
                             asterisks, e.g. ``'f***'``. If False or
                             omitted, profanities won't be filtered out.

    :type speech_context: list
    :param speech_context: (Optional) A list of strings (max 50) containing
                           words and phrases "hints" so that the speech
                           recognition is more likely to recognize them.
                           This can be used to improve the accuracy for
                           specific words and phrases. This can also be used to
                           add new words to the vocabulary of the recognizer.

    :type single_utterance: bool
    :param single_utterance: (Optional) If false or omitted, the recognizer
                             will perform continuous recognition
                             (continuing to process audio even if the user
                             pauses speaking) until the client closes the
                             output stream (gRPC API) or when the maximum
                             time limit has been reached. Multiple
                             SpeechRecognitionResults with the is_final
                             flag set to true may be returned.

                             If true, the recognizer will detect a single
                             spoken utterance. When it detects that the
                             user has paused or stopped speaking, it will
                             return an END_OF_UTTERANCE event and cease
                             recognition. It will return no more than one
                             SpeechRecognitionResult with the is_final flag
                             set to true.

    :type interim_results: bool
    :param interim_results: (Optional) If true, interim results (tentative
                            hypotheses) may be returned as they become
                            available (these interim results are indicated
                            with the is_final=false flag). If false or
                            omitted, only is_final=true result(s) are
                            returned.

    :rtype: generator
    :returns: Generator yielding ``StreamingRecognizeRequest`` protobuf
              messages: first a configuration-only request, then one
              request per chunk of audio read from ``sample.stream``
              until the stream is exhausted.
    """
    config_request = _make_streaming_request(
        sample,
        language_code=language_code,
        max_alternatives=max_alternatives,
        profanity_filter=profanity_filter,
        speech_context=SpeechContext(phrases=speech_context),
        single_utterance=single_utterance,
        interim_results=interim_results)

    # The config request MUST go first and not contain any audio data.
    yield config_request

    # Stream the audio in ``chunk_size`` pieces; an empty read signals the
    # end of the sample's stream.
    while True:
        data = sample.stream.read(sample.chunk_size)
        if not data:
            break
        yield StreamingRecognizeRequest(audio_content=data)