def transcribe(self,
               audio,
               language="en-US",
               show_all=False,
               metrics=None):
    """Transcribe ``audio`` through the Cerberus STT proxy (Google STT v2).

    Arguments:
        audio: object exposing ``get_flac_data()`` (FLAC-encoded audio).
        language: IETF language tag forwarded to the proxy (default "en-US").
        show_all: if True, return the raw JSON response instead of the
            single best transcript.
        metrics: optional collector with a ``timer(name, seconds)`` method;
            when given, the request duration is reported to it.

    Returns:
        The most likely transcript string, or the full JSON result when
        ``show_all`` is True.

    Raises:
        CerberusAccessDenied: if the proxy answers HTTP 401.
        UnknownValueError: if the response is not JSON or contains no
            usable transcription.
    """
    timer = Stopwatch()
    timer.start()
    identity = IdentityManager().get()
    headers = {}
    if identity.token:
        # Paired devices authenticate as "Bearer <device_id>:<token>"
        headers['Authorization'] = 'Bearer %s:%s' % (identity.device_id,
                                                     identity.token)

    response = requests.post(config.get("proxy_host") +
                             "/stt/google_v2?language=%s&version=%s" %
                             (language, self.version),
                             audio.get_flac_data(),
                             headers=headers)

    if metrics:
        t = timer.stop()
        metrics.timer("mycroft.cerberus.proxy.client.time_s", t)
        metrics.timer("mycroft.stt.remote.time_s", t)

    if response.status_code == 401:
        raise CerberusAccessDenied()

    try:
        actual_result = response.json()
    except ValueError:
        # Narrowed from a bare ``except:`` -- requests raises ValueError
        # (JSONDecodeError) for a non-JSON body; only that should map to
        # UnknownValueError, anything else (KeyboardInterrupt, bugs) must
        # propagate.
        raise UnknownValueError()

    log.info("STT JSON: " + json.dumps(actual_result))
    if show_all:
        return actual_result

    # return the best guess
    if "alternative" not in actual_result:
        raise UnknownValueError()
    alternatives = actual_result["alternative"]
    if len([alt for alt in alternatives if alt.get('confidence')]) > 0:
        # if there is at least one element with confidence, force it to
        # the front
        alternatives.sort(key=lambda e: e.get('confidence', 0.0),
                          reverse=True)

    for entry in alternatives:
        if "transcript" in entry:
            return entry["transcript"]

    if len(alternatives) > 0:
        log.error("Found %d entries, but none with a transcript." %
                  len(alternatives))

    # no transcriptions available
    raise UnknownValueError()
# Exemple #2
# 0
    def recognize_cloudasr(self,
                           audio_data,
                           language='en-wiki',
                           show_all=False):
        """Recognize speech in ``audio_data`` using the CloudASR API.

        ``audio_data`` must be an ``AudioData`` instance; it is converted to
        16 kHz WAV before upload.  ``language`` is the CloudASR language/model
        identifier (default ``'en-wiki'``).  Returns the most likely
        transcript, or the raw per-chunk API result when ``show_all`` is
        true.  Raises ``UnknownValueError`` when nothing intelligible came
        back and ``RequestError`` on HTTP/connection failures.
        """
        assert isinstance(audio_data,
                          AudioData), "`audio_data` must be audio data"
        assert isinstance(language, str), "`language` must be a string"

        sample_rate = 16000
        wav_data = audio_data.get_wav_data(convert_rate=sample_rate)

        request = Request(
            'https://api.cloudasr.com/recognize?lang={}'.format(language),
            data=wav_data,
            headers={
                "Content-Type":
                "audio/x-wav; rate={0};".format(sample_rate)
            })

        try:
            response = urlopen(request)
        except HTTPError as e:
            # getattr keeps compatibility with Python 2.6 HTTPError
            reason = getattr(e, "reason", "status {0}".format(e.code))
            raise RequestError(
                "recognition request failed: {0}".format(reason))
        except URLError as e:
            raise RequestError(
                "recognition connection failed: {0}".format(e.reason))

        # The service streams one JSON document per line; take the first
        # non-empty result block and ignore blank lines.
        actual_result = []
        for line in response.read().decode("utf-8").split("\n"):
            if not line:
                continue
            parsed = json.loads(line)["result"]
            if parsed:
                actual_result = parsed[0]
                break

        if show_all:
            return actual_result
        if "alternative" not in actual_result:
            raise UnknownValueError()
        for entry in actual_result["alternative"]:
            if "transcript" in entry:
                return entry["transcript"]
        raise UnknownValueError()  # no transcriptions available
    def recognize(self, audio_data, keyword_entries=None, grammar=None):
        """Decode ``audio_data`` with the local pocketsphinx decoder.

        Exactly one decoding mode runs:
        keyword spotting when ``keyword_entries`` (pairs of keyword string
        and sensitivity in [0, 1]) is given, grammar-constrained decoding
        when ``grammar`` is a path to a JSGF/FSG file, otherwise freeform
        recognition.  Returns the hypothesis string or raises
        ``UnknownValueError`` when the decoder produced none.
        """
        language = self.lang
        assert isinstance(audio_data,
                          AudioData), "``audio_data`` must be audio data"
        assert isinstance(language, str), "``language`` must be a string"
        if keyword_entries is not None:
            assert all(
                isinstance(kw, (type(""), type(u""))) and 0 <= sens <= 1
                for kw, sens in keyword_entries), \
                "``keyword_entries`` must be ``None`` or" \
                " a list of pairs of strings and " \
                "numbers between 0 and 1"

        # pocketsphinx expects 16 kHz, 16-bit mono PCM
        pcm = audio_data.get_raw_data(convert_rate=16000, convert_width=2)

        if keyword_entries is not None:  # explicitly specified set of keywords
            # The keywords file must stay on disk while the decoder uses it,
            # so all decoding happens inside the temp-file context.
            with PortableNamedTemporaryFile("w") as kw_file:
                kw_file.writelines(
                    "{} /1e{}/\n".format(kw, 100 * sens - 110)
                    for kw, sens in keyword_entries)
                kw_file.flush()

                self.decoder.set_kws("keywords", kw_file.name)
                self.decoder.set_search("keywords")
                self.decoder.start_utt()
                self.decoder.process_raw(pcm, False, True)
                self.decoder.end_utt()
        elif grammar is not None:  # a path to a FSG or JSGF grammar
            if not os.path.exists(grammar):
                raise ValueError(
                    "Grammar '{0}' does not exist.".format(grammar))
            grammar_dir = os.path.abspath(os.path.dirname(grammar))
            grammar_name = os.path.splitext(os.path.basename(grammar))[0]
            fsg_path = "{0}/{1}.fsg".format(grammar_dir, grammar_name)
            if os.path.exists(fsg_path):
                fsg = FsgModel(fsg_path, self.decoder.get_logmath(), 7.5)
            else:
                # compile the JSGF grammar down to an FSG and cache it
                jsgf = Jsgf(grammar)
                rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
                fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
                fsg.writefile(fsg_path)
            self.decoder.set_fsg(grammar_name, fsg)
            self.decoder.set_search(grammar_name)
            self.decoder.start_utt()
            self.decoder.process_raw(pcm, False, True)
            self.decoder.end_utt()
        else:  # no keywords, perform freeform recognition
            self.decoder.start_utt()
            self.decoder.process_raw(pcm, False, True)
            self.decoder.end_utt()

        hypothesis = self.decoder.hyp()
        if hypothesis is None:
            raise UnknownValueError()  # no transcriptions available
        return hypothesis.hypstr
def listen():
    """Listen on the global ``mic`` until one utterance is transcribed.

    Repeatedly adjusts for ambient noise, records audio and sends it to
    Google STT (via the global recognizer ``r``).  When several
    alternatives come back, they are printed and the first one is chosen.
    Unrecognized audio prints a prompt and the loop retries.

    Returns:
        The chosen transcript string.
    """
    transcript = None
    with mic as source:
        while transcript is None:
            try:
                r.adjust_for_ambient_noise(source)
                audio = r.listen(source)
                text_options = r.recognize_google(audio, show_all=True)
                if len(text_options) == 0:
                    raise UnknownValueError()
                # Robustness: a malformed (non-empty) response without
                # 'alternative' would previously crash with KeyError;
                # treat it like an unrecognized utterance instead.
                if 'alternative' not in text_options:
                    raise UnknownValueError()

                if len(text_options['alternative']) > 1:
                    print("")
                    print(
                        "I am confused between below options but I will go with first one:"
                    )
                    print("")
                    # enumerate replaces the manual counter; output is
                    # identical to the original print(i + 1, ". ", ...)
                    for i, option in enumerate(text_options['alternative'],
                                               start=1):
                        print(i, ". ", option['transcript'])
                    print("")

                transcript = text_options['alternative'][0]['transcript']
                print(" >> " + transcript)

            except UnknownValueError:
                print(
                    "Sorry, could not recognize what you have said, say any word or sentence"
                )

    return transcript
# Exemple #5
# 0
    def recognize_google(self,
                         audio_data,
                         key=None,
                         language="en-US",
                         show_all=False):
        """
        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.

        The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.

        To obtain your own API key, simply following the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".

        The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.

        Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
        Returns ``0`` (instead of raising) when no alternatives came back -- a
        deliberate local modification, see the commented-out raise below.

        Raises a ``speech_recognition.UnknownValueError`` exception if the best hypothesis has no transcript. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
        """
        assert isinstance(audio_data,
                          AudioData), "``audio_data`` must be audio data"
        assert key is None or isinstance(
            key, str), "``key`` must be ``None`` or a string"
        assert isinstance(language, str), "``language`` must be a string"

        flac_data = audio_data.get_flac_data(
            convert_rate=None if audio_data.sample_rate >= 8000 else
            8000,  # audio samples must be at least 8 kHz
            convert_width=2  # audio samples must be 16-bit
        )
        if key is None: key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
        url = "http://www.google.com/speech-api/v2/recognize?{}".format(
            urlencode({
                "client": "chromium",
                "lang": language,
                "key": key,
            }))
        request = Request(url,
                          data=flac_data,
                          headers={
                              "Content-Type":
                              "audio/x-flac; rate={}".format(
                                  audio_data.sample_rate)
                          })

        # obtain audio transcription results
        try:
            response = urlopen(request, timeout=self.operation_timeout)
        except HTTPError as e:
            raise RequestError("recognition request failed: {}".format(
                e.reason))
        except URLError as e:
            raise RequestError("recognition connection failed: {}".format(
                e.reason))
        response_text = response.read().decode("utf-8")

        # ignore any blank blocks
        actual_result = []
        for line in response_text.split("\n"):
            if not line: continue
            result = json.loads(line)["result"]
            if len(result) != 0:
                actual_result = result[0]
                break

        # return results
        if show_all: return actual_result
        # if not isinstance(actual_result, dict) or len(actual_result.get("alternative", [])) == 0: raise UnknownValueError()
        if not isinstance(actual_result, dict) or len(
                actual_result.get("alternative", [])) == 0:
            return 0

        # BUG FIX: the original tested `"confidence" in actual_result["alternative"]`,
        # i.e. string membership in a *list of dicts*, which is always False --
        # the max-by-confidence branch was dead code.  Check the alternatives
        # themselves instead.
        if any("confidence" in alternative
               for alternative in actual_result["alternative"]):
            # return alternative with highest confidence score; .get() keeps
            # alternatives without a confidence field from raising KeyError
            best_hypothesis = max(
                actual_result["alternative"],
                key=lambda alternative: alternative.get("confidence", 0))
        else:
            # when there is no confidence available, we arbitrarily choose the first hypothesis.
            best_hypothesis = actual_result["alternative"][0]
        if "transcript" not in best_hypothesis: raise UnknownValueError()
        return best_hypothesis["transcript"]
# Exemple #6
# 0
def my_recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None,
                           show_all=False):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API.
    This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart <https://cloud.google.com/speech/docs/getting-started>`__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials.
    The path to this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials> (remember to define GOOGLE_APPLICATION_CREDENTIALS environment variable)`__.
    The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation <https://cloud.google.com/speech/docs/languages>`__.
    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
    Returns a ``{'transcript': ..., 'confidence': ...}`` dict (transcripts of all results joined, confidences averaged) if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.
    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
    """
    assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
    assert isinstance(language, str), "``language`` must be a string"
    # (the original loop variable shadowed ``preferred_phrases``; behavior is
    # the same -- every phrase must be a string)
    assert preferred_phrases is None or all(
        isinstance(phrase, (type(""), type(u""))) for phrase in
        preferred_phrases), "``preferred_phrases`` must be a list of strings"

    # See https://cloud.google.com/speech/reference/rest/v1/RecognitionConfig
    flac_data = audio_data.get_flac_data(
        convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)),
        # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
        convert_width=2  # audio samples must be 16-bit
    )

    try:
        #from oauth2client.client import GoogleCredentials
        from googleapiclient.discovery import build
        import googleapiclient.errors
        import google.auth
        from google.oauth2 import service_account
        # cannot simply use 'http = httplib2.Http(timeout=self.operation_timeout)'
        # because discovery.build() says 'Arguments http and credentials are mutually exclusive'
        import socket
        import googleapiclient.http
        if self.operation_timeout and socket.getdefaulttimeout() is None:
            # override constant (used by googleapiclient.http.build_http())
            googleapiclient.http.DEFAULT_HTTP_TIMEOUT_SEC = self.operation_timeout

        if credentials_json is None:
            # BUG FIX: google.auth.default() returns a (credentials, project_id)
            # tuple; the original passed the whole tuple to build(credentials=...)
            api_credentials, _ = google.auth.default()
        else:
            api_credentials = service_account.Credentials.from_service_account_file(credentials_json)
            # the credentials can only be read from a file, so we'll make a temp file and write in the contents to work around that
            #with PortableNamedTemporaryFile("w") as f:
            #    f.write(credentials_json)
            #    f.flush()
            #    api_credentials = GoogleCredentials.from_stream(f.name)

        speech_service = build("speech", "v1", credentials=api_credentials, cache_discovery=False)
    except ImportError:
        raise RequestError(
            "missing google-api-python-client module: ensure that google-api-python-client is set up correctly.")

    speech_config = {"encoding": "FLAC", "sampleRateHertz": audio_data.sample_rate, "languageCode": language}
    if preferred_phrases is not None:
        speech_config["speechContexts"] = [{"phrases": preferred_phrases}]
    if show_all:
        speech_config["enableWordTimeOffsets"] = True  # some useful extra options for when we want all the output
    request = speech_service.speech().recognize(
        body={"audio": {"content": base64.b64encode(flac_data).decode("utf8")}, "config": speech_config})

    try:
        response = request.execute()
    except googleapiclient.errors.HttpError as e:
        raise RequestError(e)
    except URLError as e:
        raise RequestError("recognition connection failed: {0}".format(e.reason))

    if show_all: return response
    if "results" not in response or len(response["results"]) == 0: raise UnknownValueError()
    transcript = ""
    averageConfidence = 0
    numberOfTranscripts = 0
    for result in response["results"]:
        # NOTE(review): assumes every result's top alternative carries both
        # "transcript" and "confidence" -- confirm against the API response
        transcript += result["alternatives"][0]["transcript"].strip() + " "
        averageConfidence += result["alternatives"][0]["confidence"]
        numberOfTranscripts += 1

    averageConfidence /= numberOfTranscripts
    return {
        'transcript': transcript,
        'confidence': averageConfidence
    }