    def process_result(self, data):
        # parse sentence hypotheses (SHYPO) from the Julius module-mode output
        results = {}
        for shypo in data.xpath("//SHYPO"):
            transcript = []
            confidence = 0.0
            for whypo in shypo.xpath("./WHYPO"):
                word = whypo.attrib["WORD"].encode(self.encoding)
                # skip meta symbols such as sentence markers (e.g. <s>, </s>)
                if word.startswith("<"):
                    continue
                transcript.append(word)
                confidence += float(whypo.attrib["CM"])
            if not transcript:
                continue
            # average word confidence over the sentence
            confidence /= len(transcript)
            transcript = " ".join(transcript)
            results[confidence] = transcript

        msg = SpeechRecognitionCandidates()
        debug_str = ["Recognized:"]
        # publish candidates ordered by descending confidence
        for i, (c, t) in enumerate(sorted(results.items(), reverse=True)):
            debug_str += ["%d: %s (%.2f)" % (i + 1, t, c)]
            msg.transcript.append(t)
            msg.confidence.append(c)
        self.pub_speech_recognition.publish(msg)
        self.last_speech = msg
        rospy.logdebug(os.linesep.join(debug_str))
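
The SHYPO/WHYPO structure parsed above is Julius module-mode XML output. Below is a minimal standalone sketch of the same parsing, assuming lxml and a hand-written result string; the RANK/SCORE attributes and the words are only illustrative.

# Hedged sketch: parse a hand-written Julius-style result the same way
# process_result does; attribute values below are made up.
from lxml import etree

xml = b"""
<RECOGOUT>
  <SHYPO RANK="1" SCORE="-3000.0">
    <WHYPO WORD="&lt;s&gt;" CM="1.000"/>
    <WHYPO WORD="hello" CM="0.950"/>
    <WHYPO WORD="robot" CM="0.870"/>
    <WHYPO WORD="&lt;/s&gt;" CM="1.000"/>
  </SHYPO>
</RECOGOUT>
"""

root = etree.fromstring(xml)
for shypo in root.xpath("//SHYPO"):
    whypos = [w for w in shypo.xpath("./WHYPO")
              if not w.attrib["WORD"].startswith("<")]
    words = [w.attrib["WORD"] for w in whypos]
    scores = [float(w.attrib["CM"]) for w in whypos]
    print("%s (%.2f)" % (" ".join(words), sum(scores) / len(scores)))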
Example #2
    def __init__(self):
        # load parameters
        self.encoding = rospy.get_param("~encoding", "utf-8")
        self.default_duration = rospy.get_param("~duration", 3.0)
        self.default_threshold = rospy.get_param("~threshold", 0.8)
        self.use_isolated_word = rospy.get_param("~use_isolated_word", True)
        self.start_signal_action_timeout = rospy.get_param(
            "~start_signal_action_timeout", 0.3)

        self.pub_speech_recognition = rospy.Publisher(
            "speech_to_text", SpeechRecognitionCandidates, queue_size=1)

        # initialize sound play
        self.act_sound = actionlib.SimpleActionClient("sound_play",
                                                      SoundRequestAction)
        if not self.act_sound.wait_for_server(rospy.Duration(5.0)):
            rospy.logwarn(
                "Failed to find sound_play action. Disabled audio alert")
            self.act_sound = None

        # setup julius
        host = rospy.get_param("~host", "localhost")
        module_port = rospy.get_param("~module_port", 10500)
        audio_port = rospy.get_param("~audio_port", 10501)
        max_retry = rospy.get_param("~max_connection_retry", 0)

        self.module = ModuleClient(host, module_port, max_retry, self.encoding)
        self.audio = AudioTransport(host, audio_port, max_retry, "audio")

        rospy.on_shutdown(self.shutdown_cb)
        self.module.on_received_data(self.julius_cb)

        self.module.start()
        self.audio.start()

        self.status()
        self.start()

        # initialize state, services and subscribers
        self.lock = Lock()
        self.last_speech = SpeechRecognitionCandidates()
        self.vocabularies = {}

        self.srv_show_status = rospy.Service("show_julius_status", Empty,
                                             self.status)

        if self.use_isolated_word:
            self.sub_vocabulary = rospy.Subscriber("vocabulary", Vocabulary,
                                                   self.vocabulary_cb)
        else:
            self.sub_grammar = rospy.Subscriber("grammar", Grammar,
                                                self.grammar_cb)
        self.srv_speech_recognition = rospy.Service("speech_recognition",
                                                    SpeechRecognition,
                                                    self.speech_recognition_cb)
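
With ~use_isolated_word enabled, the node above takes its word list from the "vocabulary" topic. A hedged sketch of feeding it; the message fields (name, words) are the ones used by speech_recognition_cb further below, while the node name and word list are made up.

# Hedged sketch: publish a word list to the "vocabulary" topic subscribed above.
import rospy
from speech_recognition_msgs.msg import Vocabulary

rospy.init_node("vocabulary_publisher")
pub = rospy.Publisher("vocabulary", Vocabulary, queue_size=1)
rospy.sleep(1.0)  # give the subscriber time to connect

voca = Vocabulary()
voca.name = "greetings"            # hypothetical vocabulary name
voca.words = ["hello", "goodbye"]  # hypothetical word list
pub.publish(voca)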
    def speech_recognition_srv_cb(self, req):
        res = SpeechRecognitionResponse()

        duration = req.duration
        if duration <= 0.0:
            duration = self.default_duration

        with self.audio as src:
            if self.dynamic_energy_threshold:
                self.recognizer.adjust_for_ambient_noise(src)
                rospy.loginfo("Set minimum energy threshold to %f" %
                              self.recognizer.energy_threshold)

            if not req.quiet:
                self.play_sound("start", 0.1)

            start_time = rospy.Time.now()
            while (rospy.Time.now() - start_time).to_sec() < duration:
                rospy.loginfo("Waiting for speech...")
                try:
                    audio = self.recognizer.listen(
                        src,
                        timeout=self.listen_timeout,
                        phrase_time_limit=self.phrase_time_limit)
                except SR.WaitTimeoutError as e:
                    rospy.logwarn(e)
                    break
                if not req.quiet:
                    self.play_sound("recognized", 0.05)
                rospy.loginfo("Waiting for result... (Sent %d bytes)" %
                              len(audio.get_raw_data()))

                try:
                    result = self.recognize(audio)
                    rospy.loginfo("Result: %s" % result.encode('utf-8'))
                    if not req.quiet:
                        self.play_sound("success", 0.1)
                    res.result = SpeechRecognitionCandidates(
                        transcript=[result])
                    return res
                except SR.UnknownValueError:
                    if self.dynamic_energy_threshold:
                        self.recognizer.adjust_for_ambient_noise(src)
                        rospy.loginfo("Set minimum energy threshold to %f" %
                                      self.recognizer.energy_threshold)
                except SR.RequestError as e:
                    rospy.logerr("Failed to recognize: %s" % str(e))
                rospy.sleep(0.1)
                if rospy.is_shutdown():
                    break

            # Timeout
            if not req.quiet:
                self.play_sound("timeout", 0.1)
            return res
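
The "speech_recognition" service handled above can be called with a plain service proxy. A hedged client sketch, assuming the SpeechRecognition service type from speech_recognition_msgs; only the duration and quiet fields read by the callback are filled in, with arbitrary values.

# Hedged sketch: call the "speech_recognition" service advertised above.
import rospy
from speech_recognition_msgs.srv import SpeechRecognition, SpeechRecognitionRequest

rospy.init_node("speech_recognition_client")
rospy.wait_for_service("speech_recognition")
recognize = rospy.ServiceProxy("speech_recognition", SpeechRecognition)

req = SpeechRecognitionRequest()
req.duration = 5.0   # zero or less falls back to the server default
req.quiet = False    # keep the start / success sounds
res = recognize(req)
if res.result.transcript:
    print("Heard: %s" % res.result.transcript[0])
else:
    print("Nothing recognized before the timeout")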
Example #4
    def audio_cb(self, msg):
        if self.is_canceling:
            rospy.loginfo("Speech is cancelled")
            return
        data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
        try:
            if self.asr_engine == "google_legacy_single_utterance":
                rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
                result = self.recognizer.recognize_google(
                    data, language=self.language)
                msg = SpeechRecognitionCandidates(transcript=[result])
                rospy.loginfo("Got result: %s" % msg)
                self.pub_speech.publish(msg)

            elif self.asr_engine == "google_cloud_single_utterance":
                rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
                result = self.recognizer.recognize_google_cloud(
                    data, language=self.language)
                msg = SpeechRecognitionCandidates(transcript=[result])
                rospy.loginfo("Got result {}: {}".format(len(data.get_raw_data()), msg))
                self.pub_speech.publish(msg)

            elif self.asr_engine == "ibm_single_utterance":
                rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
                result = self.recognizer.recognize_ibm_single_utterance(
                    data,
                    key=os.environ['IBM_API'],
                    language=self.language)
                # url=os.environ['IBM_URL'] may also be needed; do not forget the /v1/recognize ending
                msg = SpeechRecognitionCandidates(transcript=[result])
                rospy.loginfo("Got result {}: {}".format(len(data.get_raw_data()), msg))
                self.pub_speech.publish(msg)

            else:
                rospy.logerr("No valid Recognizer was set (in the roslaunch file): %s" % self.asr_engine)

        except SR.UnknownValueError as e:
            rospy.logerr("Failed to recognize: %s" % str(e))
        except SR.RequestError as e:
            rospy.logerr("Failed to recognize: %s" % str(e))

    def audio_cb(self, msg):
        if self.is_canceling:
            rospy.loginfo("Speech is cancelled")
            return
        data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
        try:
            rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
            result = self.recognizer.recognize_google(
                data, language=self.language)
            msg = SpeechRecognitionCandidates(transcript=[result])
            self.pub_speech.publish(msg)
        except SR.UnknownValueError as e:
            rospy.logerr("Failed to recognize: %s" % str(e))
        except SR.RequestError as e:
            rospy.logerr("Failed to recognize: %s" % str(e))

    def audio_cb(self, _, audio):
        try:
            rospy.logdebug("Waiting for result... (Sent %d bytes)"
                           % len(audio.get_raw_data()))
            result = self.recognize(audio)
            rospy.loginfo("Result: %s" % result.encode('utf-8'))
            msg = SpeechRecognitionCandidates(transcript=[result])
            self.pub.publish(msg)
        except SR.UnknownValueError:
            if self.dynamic_energy_threshold:
                self.recognizer.adjust_for_ambient_noise(self.audio)
                rospy.loginfo("Updated energy threshold to %f"
                              % self.recognizer.energy_threshold)
        except SR.RequestError as e:
            rospy.logerr("Failed to recognize: %s" % str(e))

    def speech_recognition_cb(self, req):
        res = SpeechRecognitionResponse()

        # change grammar
        candidate_words = []
        if req.grammar_name:
            ok = self.activate_gram(req.grammar_name)
            if not ok:
                rospy.logerr("failed to activate grammar %s" % req.grammar_name)
                return res
            if req.grammar_name in self.vocabularies:
                candidate_words = self.vocabularies[req.grammar_name]
        elif req.grammar.rules:
            g = req.grammar
            if not g.name:
                g.name = 'unknown'
            grammar = make_grammar_from_rules(g.rules)
            voca = make_voca_from_categories(g.categories, g.vocabularies)
            result = make_dfa(grammar, voca)
            if result is None:
                rospy.logerr("Failed to make dfa from grammar message")
                return res
            dfa, dic = result
            ok = self.change_gram(g.name, dfa.split(os.linesep), dic.split(os.linesep))
            if not ok:
                rospy.logerr("Failed to change grammar")
                return res
            self.vocabularies[g.name] = list(set(e for v in g.vocabularies for e in v.words))
            candidate_words = self.vocabularies[g.name]
        elif req.vocabulary.words:
            v = req.vocabulary
            if not v.name:
                v.name = 'unknown'
            if len(v.phonemes) == 0 or not v.phonemes[0]:
                v.phonemes = make_phonemes_from_words(v.words)
            dic = [" %s\t%s" % (w, p) for w, p in zip(v.words, v.phonemes)]
            ok = self.change_gram(v.name, None, dic)
            if not ok:
                rospy.logerr("Failed to change vocabulary")
                return res
            self.vocabularies[v.name] = list(set(v.words))
            candidate_words = self.vocabularies[v.name]
        else:
            rospy.logerr("Invalid request: 'grammar_name', 'grammar' or 'vocabulary' must be filled")
            return res

        duration = req.duration
        if duration <= 0.0:
            duration = self.default_duration

        threshold = req.threshold
        if threshold <= 0.0 or threshold > 1.0:
            threshold = self.default_threshold

        if not req.quiet:
            self.play_sound(self.start_signal, self.start_signal_action_timeout)
        start_time = rospy.Time.now()
        self.last_speech = SpeechRecognitionCandidates()
        while (rospy.Time.now() - start_time).to_sec() < duration:
            rospy.sleep(0.1)
            speech = self.last_speech
            if not self.last_speech.transcript:
                continue
            if candidate_words:
                ok = speech.transcript[0] in candidate_words and speech.confidence[0] >= threshold
            else:
                ok = speech.confidence[0] >= threshold
            if ok:
                t0 = speech.transcript[0]
                c0 = speech.confidence[0]
                rospy.loginfo("Recognized %s (%f)..." % (t0, c0))
                if not req.quiet:
                    self.play_sound(self.success_signal, 0.1)
                res.result = speech
                return res

        # timeout
        rospy.logerr("Timed out")
        if not req.quiet:
            self.play_sound(self.timeout_signal, 0.1)
        return res
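
The Julius-backed callback above also accepts the word list inline in the request. A hedged sketch of an isolated-word call; the vocabulary contents, duration, and threshold are made up, and the request fields mirror the ones the callback reads.

# Hedged sketch: isolated-word request against the callback above.
import rospy
from speech_recognition_msgs.msg import Vocabulary
from speech_recognition_msgs.srv import SpeechRecognition, SpeechRecognitionRequest

rospy.init_node("isolated_word_client")
rospy.wait_for_service("speech_recognition")
recognize = rospy.ServiceProxy("speech_recognition", SpeechRecognition)

req = SpeechRecognitionRequest()
req.vocabulary = Vocabulary(name="colors", words=["red", "green", "blue"])
req.duration = 10.0   # listen for up to ten seconds
req.threshold = 0.9   # accept only fairly confident matches
res = recognize(req)
print(res.result.transcript)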
    def androidCallback(self, msg):
        self.pub.publish(SpeechRecognitionCandidates(transcript=msg.texts))
Example #9
#!/usr/bin/env python

import rospy

from std_msgs.msg import String
from speech_recognition_msgs.msg import SpeechRecognitionCandidates

if __name__ == "__main__":

    text = SpeechRecognitionCandidates()
    text.transcript.append(u"thời tiết")  # Vietnamese for "weather"
    text.confidence.append(1.0)

    rospy.init_node("fake_speech")
    speech = rospy.Publisher('/speech_to_text',
                             SpeechRecognitionCandidates,
                             queue_size=2)

    rate = rospy.Rate(10)

    while not rospy.is_shutdown():
        speech.publish(text)
        rate.sleep()
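
A minimal subscriber sketch to check the fake publisher above; the topic and message type come from the publisher code, the node name is arbitrary.

# Hedged sketch: print whatever the fake publisher above sends on /speech_to_text.
import rospy
from speech_recognition_msgs.msg import SpeechRecognitionCandidates

def speech_cb(msg):
    rospy.loginfo("transcript: %s confidence: %s"
                  % (list(msg.transcript), list(msg.confidence)))

rospy.init_node("speech_listener")
rospy.Subscriber("/speech_to_text", SpeechRecognitionCandidates, speech_cb,
                 queue_size=1)
rospy.spin()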