Example #1
0
def speech_to_text(palabras=None, *args):
    """Listen on the default microphone until a recognized word matches.

    Reads 16 kHz mono audio and feeds it to a Vosk recognizer until the
    decoded text equals one of *palabras* or the stream yields no data.

    :param palabras: accepted words; ``None`` behaves like an empty list.
    :param args: unused, kept for backward compatibility.
    :return: the last recognized text, or the placeholder
        "Palabra no encontrada" when nothing was recognized.
    """
    import os
    import pyaudio
    import json

    # Fix: `palabras=[]` was a shared mutable default argument.
    if palabras is None:
        palabras = []

    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)

    model = Model("model")
    rec = KaldiRecognizer(model, 16000)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    word = "Palabra no encontrada"
    cont = 1
    matched = False
    while not matched:
        cont = cont + 1
        print(cont)
        data = stream.read(10000, exception_on_overflow=False)
        if len(data) == 0:
            print("breaking process")
            break
        if rec.AcceptWaveform(data):
            # Fix: Result() consumes the recognizer state, so the original's
            # repeated calls returned different (empty) payloads. Call once.
            result = rec.Result()
            word = json.loads(result)['text']
            print("result ->" + word)
            print("result json ->" + result)
        else:
            # Fix: the original queried rec.Result() here even though the
            # utterance was unfinished; use the partial hypothesis instead.
            partial = rec.PartialResult()
            word = json.loads(partial)['partial']
            print("PartialResult ->" + word)
            print("PartialResult json ->" + partial)

        if word in palabras:
            matched = True

    print("palabra aceptada")
    stream.stop_stream()
    # stream.close()
    p.terminate()

    return word
Example #2
0
def audio_to_txt(file_name):
    """Transcribe an audio file to text with Vosk.

    Converts *file_name* to a temporary ``out.wav`` via ffmpeg, runs
    large-vocabulary recognition over it, removes the temporary file and
    returns the transcript.

    :param file_name: path to any audio file ffmpeg can read.
    :return: recognized text, one space between finalized segments.
    """
    import subprocess

    # Fix: list-form argv avoids shell injection through the file name
    # (the original interpolated it into an os.system string).
    subprocess.run(['ffmpeg', '-i', file_name, 'out.wav'], check=True)

    model = Model("model")

    wf = wave.open('out.wav', "rb")

    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
    ) != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)

    # Large vocabulary free form recognition at the file's own sample rate
    # (the original also built a throwaway recognizer at 16000 first).
    rec = KaldiRecognizer(model, wf.getframerate())
    segments = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            segments.append(json.loads(rec.Result())['text'])
    # Fix: flush the recognizer so trailing audio is not silently dropped.
    segments.append(json.loads(rec.FinalResult())['text'])
    wf.close()
    os.remove('out.wav')
    # Fix: plain `+=` concatenation glued adjacent segments together.
    return ' '.join(s for s in segments if s)
def recognition():
    """Recognize spoken digits from the WAV file named on the command line.

    Decoding is constrained to a digit grammar plus [unk]; every partial,
    full and final recognizer result is printed.
    """
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)

    wf = wave.open(sys.argv[1], "rb")
    is_mono_pcm = (wf.getnchannels() == 1 and wf.getsampwidth() == 2
                   and wf.getcomptype() == "NONE")
    if not is_mono_pcm:
        print("Audio file must be WAV format mono PCM.")
        exit(1)

    model = Model("model")

    # You can also specify the possible word or phrase list as JSON list, the order doesn't have to be strict
    rec = KaldiRecognizer(
        model, wf.getframerate(),
        '["oh one two three four five six seven eight nine zero", "[unk]"]')

    while True:
        chunk = wf.readframes(4000)
        if not chunk:
            break
        print(rec.Result() if rec.AcceptWaveform(chunk) else rec.PartialResult())

    print(rec.FinalResult())
Example #4
0
def translate_file(filename="last5.wav"):
    """Transcribe a WAV file in the current directory and return its text.

    :param filename: name of a mono 16-bit PCM WAV file in the working dir.
    :return: the recognized text from the final recognizer result.
    """
    SetLogLevel(0)

    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    filepath = os.path.join(".", filename)
    wf = wave.open(filepath, "rb")
    try:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
        ) != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)

        model = Model("./model")
        rec = KaldiRecognizer(model, wf.getframerate())

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                print(rec.Result())
            else:
                print(rec.PartialResult())
        results = rec.FinalResult()
    finally:
        # Fix: the original never closed the wave reader.
        wf.close()
    return json.loads(results)[
        "text"]  #["results"] for confidence of each word
Example #5
0
class VoskVoiceToTextCalculator(Calculator):
    """Calculator node that turns AudioData input into recognized text.

    Output slot 0 carries finalized utterances; slot 1 carries partial
    (in-progress) hypotheses.
    """

    def __init__(self, name, s, options=None):
        from vosk import Model, KaldiRecognizer
        super().__init__(name, s, options)
        self.model = Model("model")
        self.rec = KaldiRecognizer(self.model, 16000)
        self.output_data = [None, None]

    def process(self):
        audio = self.get(0)
        if not isinstance(audio, AudioData):
            return False
        if self.rec.AcceptWaveform(audio.audio):
            self._emit_final(audio)
        else:
            self._emit_partial(audio)
        return True

    def _emit_final(self, audio):
        # One utterance finished: parse and publish the full result.
        raw = self.rec.Result()
        try:
            payload = json.loads(raw)
        except json.decoder.JSONDecodeError as e:
            print("Voice2Text: Failed to parse voice json:", e)
            print(raw)
            return
        text = payload.get('text')
        if text:
            print("Voice2Text:", repr(text), payload)
            self.set_output(0, VoiceTextData(text, audio.timestamp, info=payload))

    def _emit_partial(self, audio):
        # Still inside an utterance: publish the running hypothesis.
        payload = json.loads(self.rec.PartialResult())
        text = payload.get('partial')
        if text:
            print("Voice2Text (partial): ", repr(text))
            self.set_output(1, VoiceTextData(text, audio.timestamp, info=payload))
Example #6
0
def speech_to_text(args):
    """Transcribe every ``*.wav`` under ``args.data`` and save transcripts.

    For each WAV file, recognition output is printed and the final result
    is written to ``args.hypothesis/<basename>.txt``.

    :param args: namespace with ``model`` (model folder name under
        'models'), ``data`` (directory of WAV files) and ``hypothesis``
        (output directory).
    """
    if not os.path.exists(os.path.join('models', args.model)):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack to 'models' folder.")
        exit(1)

    # Fix: load the model once; the original reloaded it for every file.
    model = Model(args.model)

    for filepath in glob.iglob(os.path.join(os.getcwd(), args.data, '*.wav')):
        print(filepath)

        # Fix: open the file being iterated, not the directory argument.
        wf = wave.open(filepath, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)

        rec = KaldiRecognizer(model, wf.getframerate())

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                print(rec.Result())
            else:
                print(rec.PartialResult())

        # Fix: FinalResult() flushes the recognizer, so the original's
        # second call (inside the write) returned an empty result.
        final = rec.FinalResult()
        print(final)
        wf.close()

        # Fix: joining an absolute filepath made os.path.join discard
        # args.hypothesis; derive the output name from the basename.
        base = os.path.splitext(os.path.basename(filepath))[0]
        hypothesis_path = os.path.join(args.hypothesis, base + '.txt')
        with open(hypothesis_path, 'w') as hypothesis:
            hypothesis.write(final)
Example #7
0
def listen(model: Model,
           spk_model: SpkModel = None,
           speech_chunk_sec: float = 0.5,
           buffer_sec: float = 1):
    """Continuously decode microphone audio and log each result.

    :param model: loaded Vosk acoustic model.
    :param spk_model: optional speaker-identification model.
    :param speech_chunk_sec: seconds of audio fed per recognizer call.
    :param buffer_sec: size of the PyAudio ring buffer, in seconds.
    """
    with ExitStack() as stack:
        rate = model.SampleFrequency()
        recognizer = (KaldiRecognizer(model, spk_model, rate)
                      if spk_model else KaldiRecognizer(model, rate))
        audio = stack.enter_context(_pyaudio())
        stream = stack.enter_context(
            _pyaudio_open_stream(audio,
                                 format=paInt16,
                                 channels=1,
                                 rate=rate,
                                 input=True,
                                 frames_per_buffer=int(rate * buffer_sec)))
        chunk_frames = int(rate * speech_chunk_sec)
        while True:
            block = stream.read(chunk_frames)
            if recognizer.AcceptWaveform(block):
                logging.info(json.loads(recognizer.Result()))
            else:
                logging.info(json.loads(recognizer.PartialResult()))
Example #8
0
async def processVoice(waveChunk, recognizer: KaldiRecognizer):
    """ Recognize audio chunk and process with terminal.onText() """
    signature = None
    text = ''
    final = False
    try:
        final = recognizer.AcceptWaveform(waveChunk)

        if final:
            # Phrase fully recognized: take the finalized text.
            payload = json.loads(recognizer.FinalResult())
            text = str(payload['text']).strip() if 'text' in payload else ''
        else:
            # Phrase still in progress: take the running hypothesis.
            payload = json.loads(recognizer.PartialResult())
            text = str(payload['partial']).strip() if 'partial' in payload else ''

        # Try to extract the speaker's voice signature, when present.
        signature = payload["spk"] if 'spk' in payload else []
    except KeyboardInterrupt as e:
        onCtrlC()
        raise e
    except Exception as e:
        logError(f'Exception processing phrase chunk : {e}')
    return (final, text, signature)
Example #9
0
def transcribe(path: str) -> str:
    """Transcribe a mono 16-bit PCM WAV file and return its text.

    :param path: path to the WAV file.
    :return: recognized text (finalized segments joined with spaces).
    :raises ValueError: when the model cannot be downloaded.
    """
    # check if the model is already present
    if not _download_model():
        # Fix: the original had an unreachable exit(1) after this raise.
        raise ValueError("Unable to automatically download the model.")

    wf = wave.open(path, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        logger.info("Audio file must be WAV format mono PCM.")
        exit(1)

    model = Model(MODEL_PATH)
    rec = KaldiRecognizer(model, wf.getframerate())

    # Fix: the original read the whole file in one call, kept only the text
    # of the last chunk, and left `text` unbound for an empty file.
    pieces = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            pieces.append(json.loads(rec.Result())["text"])
    # FinalResult() flushes whatever audio the recognizer still buffers.
    pieces.append(json.loads(rec.FinalResult())["text"])
    wf.close()
    return " ".join(p for p in pieces if p)
class WakeWordDetector:
    """Wake-word detector: a thin wrapper around
    `vosk-api <https://github.com/alphacep/vosk-api>`_. The default wake
    words are `'阿Q'` and `'R-Cute'`.

    To customize the wake words, see
    https://github.com/alphacep/vosk-api/blob/master/python/example/test_words.py
    """
    def __init__(
        self,
        sr=16000,
        lang='en',
        grammar='[ "a b c d e f g h i j k l m n o p q r s t u v w x y z key cute", "[unk]" ]'
    ):
        # Ensure the language model is cached, then build a
        # grammar-constrained recognizer on top of it.
        self.load(lang)
        self._det = KaldiRecognizer(util.cache[f'vosk.{lang}'], sr, grammar)

    def _detected(self, text):
        # Map raw recognizer text to the canonical wake-word label;
        # implicitly returns None when the text is not a wake word.
        if text == 'r q':
            return '阿Q'
        elif text == 'r cute':
            return 'R-Cute'

    def load(self, lang='en'):
        """load language model in advance"""
        model = util.cache.get(f'vosk.{lang}',
                               Model(util.data_file(f'vosk/{lang}')))
        util.cache[f'vosk.{lang}'] = model

    def detect(self, source, timeout=None):
        """Start detecting.

        :param source: the audio source to read from
        :param timeout: maximum detection time in seconds; `None` (the
            default) means no timeout, i.e. block until a wake word is heard
        :type timeout: float, optional
        :return: the detected wake word, or `None` when the timeout expired
            without hearing one
        :rtype: str
        """
        self._cancel = False  # possible race condition?
        if timeout:
            count = 0.0
        self._det.FinalResult()  # clear buffer
        while True:
            segment = source.read()
            if self._det.AcceptWaveform(segment.raw_data):
                p = self._detected(json.loads(self._det.Result())['text'])
            else:
                p = self._detected(
                    json.loads(self._det.PartialResult())['partial'])
            if p:
                return p
            if self._cancel:
                return
                # raise RuntimeError('Hotword detection cancelled by another thread')
            elif timeout:
                # Accumulate elapsed audio time; segment.duration_seconds is
                # presumably the chunk length in seconds — TODO confirm.
                count += segment.duration_seconds
                if count > timeout:
                    return  # self._detected(self._det.FinalResult()['text'])

    def cancel(self):
        """Stop detecting."""
        self._cancel = True
Example #11
0
    def mic_listen(self):
        """Listen on the microphone until a name+keyword phrase is heard.

        Opens a 16 kHz mono PyAudio stream, runs Vosk partial recognition
        in ~7-second windows, and returns the recognized text once it
        contains self.name together with one of self.keywords. The stream
        is reopened after any read error.
        """
        text = ""
        open_stream = True
        p = pyaudio.PyAudio()
        while True:
            if open_stream:
                self.publish("gui/boomer/task", "opening stream...")
                stream = p.open(format=pyaudio.paInt16,
                                channels=1,
                                rate=16000,
                                input=True,
                                frames_per_buffer=8000)
                stream.start_stream()
                open_stream = False

            elif len(text) != 0:
                # A phrase was captured in the previous window: stop the
                # stream (best effort) and hand the text to the caller.
                try:
                    stream.stop_stream()
                except Exception:
                    pass
                return text
            # Fresh recognizer per listening window.
            rec = KaldiRecognizer(self.sr_model, 16000)
            start_time = time.time()
            self.publish("gui/boomer/eye", "0")
            while True:
                try:
                    data = stream.read(4000)
                    if len(data) == 0:
                        break

                    elif time.time() - start_time > 7:
                        # Window expired: restart with a fresh recognizer.
                        break
                    else:
                        rec.AcceptWaveform(data)
                        output = json.loads(rec.PartialResult())
                        output = output["partial"]
                        self.debug_msg("Kaldi_r", "Output: " + output)
                        if self.name in output:
                            # Wake name heard: signal the GUI.
                            self.publish("gui/boomer/eye", "1")

                        if self.name not in output and len(output) > 7:
                            # Unrelated speech: drop this window.
                            break

                        elif self.name in output and ' '.join(
                                str(output).split()[1:len(output) -
                                                    1]) in self.keywords:
                            self.debug_msg("Kaldi_R", "got keyword")
                            text = output
                            break

                except Exception as e:
                    # Mic read failed: report it, close the stream (best
                    # effort) and schedule a reopen on the next outer pass.
                    self.publish("gui/boomer/task", str(e))
                    try:
                        stream.stop_stream()
                    except Exception:
                        pass
                    open_stream = True
                    break
Example #12
0
 def StreamingRecognize(self, request_iterator, context):
     """Stream audio chunks through Vosk and yield recognition responses.

     The first request carries the configuration (sample rate and the
     partial-results flag); subsequent requests carry audio content.
     """
     request = next(request_iterator)
     partial = request.config.specification.partial_results
     recognizer = KaldiRecognizer(self.model, request.config.specification.sample_rate_hertz)
     for request in request_iterator:
         res = recognizer.AcceptWaveform(request.audio_content)
         if res:
             # Utterance finished: emit the full result.
             yield self.get_response(recognizer.Result())
         elif partial:
             # Emit running hypotheses only when the client asked for them.
             yield self.get_response(recognizer.PartialResult())
     # Flush any audio the recognizer still buffers at end of stream.
     yield self.get_response(recognizer.FinalResult())
class Recognizer:
    """Microphone speech recognizer built on Vosk and PyAudio."""

    def __init__(self, pathToModel):
        self.answer = "None"        # last recognized text
        self.modelFlag = False      # True once the Vosk model is loaded
        self.pyAudioFlag = False    # True while the mic stream is open
        self.pathToModel = pathToModel

    def setupModel(self):
        """Load the Vosk model and open the microphone stream."""
        if not os.path.exists(self.pathToModel):
            print(
                "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder."
            )
            exit(1)
        self.model = Model(self.pathToModel)
        self.rec = KaldiRecognizer(self.model, 16000)
        self.modelFlag = True
        self.startPyaudio()

    def startPyaudio(self):
        """Open a 16 kHz mono input stream on the default microphone."""
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=pyaudio.paInt16,
                                  channels=1,
                                  rate=16000,
                                  input=True,
                                  frames_per_buffer=8000)
        self.stream.start_stream()
        self.pyAudioFlag = True

    def stopPyaudio(self):
        """Stop the stream and release the audio device."""
        self.stream.stop_stream()
        # Fix: also close the stream and terminate PortAudio; the original
        # leaked both handles. startPyaudio() creates fresh ones, so this
        # remains safe for a later restart.
        self.stream.close()
        self.p.terminate()
        self.pyAudioFlag = False

    def runTimedRecognition(self):
        """Listen for up to ~1000 chunks and return the first final text.

        :return: the recognized string, or None when nothing was finalized
            before the chunk budget ran out.
        """
        n = 1000
        while True:
            data = self.stream.read(4000)
            if len(data) == 0:
                break
            if self.rec.AcceptWaveform(data):
                result = self.rec.Result()
                print(result)
                d = json.loads(str(result))
                myStr = d["text"]
                print(myStr)
                self.answer = myStr
                return myStr
            else:
                print(self.rec.PartialResult())
            if n == 0:
                break
            print(n)
            n -= 1
def creat_text_gpu(path):
    """Transcribe the mono counterpart of *path* and write the transcript.

    Expects a sibling file named like *path* with '_mono' inserted before
    '.wav'. Relies on the module-level `model`, `parse_json` and
    `write_file` helpers.

    :param path: path to the original (non-mono) WAV file.
    """
    wf = wave.open(path.replace('.wav', '_mono.wav'), "rb")
    rec = KaldiRecognizer(model, wf.getframerate())

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        # Only the final result below is used; the original also called
        # PartialResult() and discarded it.
        rec.AcceptWaveform(data)
    # Fix: close the wave reader instead of leaking the file handle.
    wf.close()

    write_file(parse_json(rec.FinalResult()), path.split('/')[-1].replace('.wav', ''))
Example #15
0
    def StreamingRecognize(self, request_iterator, context):
        """gRPC streaming recognition: first request is config, rest audio.

        Yields one response per finalized utterance (plus partial results
        when the client requested them) and a final flush at end of stream.
        """
        config_request = next(request_iterator)
        spec = config_request.config.specification
        partial = spec.partial_results
        recognizer = KaldiRecognizer(self.model, spec.sample_rate_hertz)
        recognizer.SetMaxAlternatives(spec.max_alternatives)
        recognizer.SetWords(spec.enable_word_time_offsets)

        for chunk_request in request_iterator:
            if recognizer.AcceptWaveform(chunk_request.audio_content):
                yield self.get_response(recognizer.Result())
            elif partial:
                yield self.get_response(recognizer.PartialResult())
        yield self.get_response(recognizer.FinalResult())
Example #16
0
def offline_record_recognize_audio():
    """Record from the default microphone and print Vosk results.

    NOTE(review): a live input stream is not expected to yield an empty
    chunk, so this loop effectively runs until the process is interrupted.
    """
    model = Model(r"D:\pythonProject1\models")  # full path to the model
    rec = KaldiRecognizer(model, 8000)
    audio = pyaudio.PyAudio()
    stream = audio.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=8000,
        input=True,
        frames_per_buffer=8000
    )
    stream.start_stream()

    while True:
        chunk = stream.read(4000)
        if not chunk:
            break

        print(rec.Result() if rec.AcceptWaveform(chunk) else rec.PartialResult())

    print(rec.FinalResult())
def Speech2Text():
    """Recognize a constrained word list from recording.wav.

    :return: the final recognized text (also printed).
    """
    wf = wave.open("recording.wav", "rb")
    mono_pcm = (wf.getnchannels() == 1 and wf.getsampwidth() == 2
                and wf.getcomptype() == "NONE")
    if not mono_pcm:
        print("Audio file must be WAV format mono PCM.")
        exit(1)

    model = Model("model1")
    # You can also specify the possible word list
    rec = KaldiRecognizer(model, wf.getframerate(), "money purse police shoot")

    while True:
        chunk = wf.readframes(4000)
        if not chunk:
            break
        print(rec.Result() if rec.AcceptWaveform(chunk) else rec.PartialResult())
    res = json.loads(rec.FinalResult())
    print("Speech2Text: " + res['text'])
    return res['text']
Example #18
0
def vosk_model(address):
    """Print Vosk recognition results for the WAV file at *address*."""
    SetLogLevel(2)

    wf = wave.open(address, "rb")
    if not (wf.getnchannels() == 1 and wf.getsampwidth() == 2
            and wf.getcomptype() == "NONE"):
        print("Audio file must be WAV format mono PCM.")
        exit(1)

    rec = KaldiRecognizer(Model("../audio_utils/tests/vosk_test/model"),
                          wf.getframerate())

    while True:
        frames = wf.readframes(4000)
        if not frames:
            break
        print(rec.Result() if rec.AcceptWaveform(frames) else rec.PartialResult())

    print(rec.FinalResult())
Example #19
0
def recognize_file(filepath):
    """Recognize speech in *filepath* and return the accumulated text.

    Prints basic WAV diagnostics; a non-mono/non-PCM file only triggers a
    warning on stderr — recognition is still attempted.
    """
    wf = wave.open(filepath, "rb")
    print("press_f")
    print(wf.getnchannels())
    print(wf.getsampwidth())
    print(wf.getcomptype())
    wrong_format = (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                    or wf.getcomptype() != "NONE")
    if wrong_format:
        print("Audio file must be WAV format mono PCM.", file=sys.stderr)
    rec = KaldiRecognizer(MODEL, wf.getframerate())
    pieces = []
    while True:
        frames = wf.readframes(4000)
        if not frames:
            break
        if rec.AcceptWaveform(frames):
            pieces.append(json.loads(rec.Result())["text"] + " ")

    pieces.append(json.loads(rec.FinalResult())["text"])

    return "".join(pieces)
Example #20
0
# Command-line demo: transcribe the WAV file given as argv[1] with Vosk,
# printing every partial, full and final result as parsed JSON.
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import json

SetLogLevel(0)

# The input must be mono 16-bit PCM; anything else is rejected up front.
wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
) != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

# Model(lang=...) selects an en-us model by language name — presumably
# downloaded/cached by vosk; confirm against the installed vosk version.
model = Model(lang="en-us")
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetMaxAlternatives(10)  # report up to 10 alternative hypotheses
rec.SetWords(True)          # include per-word detail in results

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result()))
    else:
        print(json.loads(rec.PartialResult()))

print(json.loads(rec.FinalResult()))
Example #21
0
def trigger_microphone(n_clicks):
    """Record ~5 s from the Termux microphone and return its transcription.

    :param n_clicks: button click counter; 0 means "not clicked yet".
    :return: recognized text ('' when n_clicks == 0).
    """
    if n_clicks == 0:
        return ''
    print('trigger microphone %d' % n_clicks)
    import termux
    termux.Microphone.stop()
    pwd = os.environ['PWD']
    aac_file = "%s/microphone.aac" % pwd
    wave_file = "%s/microphone.wave" % pwd
    if os.path.exists(aac_file):
        os.remove(aac_file)
    termux.Microphone.record(aac_file, encoder='aac', limit=5, count=2)
    import time
    time.sleep(6)  # wait for the 5-second recording to finish
    # NOTE(review): os.system with interpolated paths is shell-injection
    # prone; both paths are derived from $PWD here, so kept as-is.
    os.system('faad -o %s %s' % (wave_file, aac_file))

    # Fix: removed the dead `if False:` speech_recognition branch.
    from vosk import Model, KaldiRecognizer, SetLogLevel
    import wave
    import numpy as np
    model_name = 'vosk-model-small-en-us-0.15'
    if not os.path.exists(model_name):
        os.system('wget http://alphacephei.com/vosk/models/%s.zip' %
                  model_name)
        os.system('unzip %s.zip' % model_name)
    wf = wave.open(wave_file, "rb")
    model = Model(model_name)
    rec = KaldiRecognizer(model, wf.getframerate())
    nch = wf.getnchannels()
    depth = wf.getsampwidth()
    # Fix: 16/32-bit WAV samples are signed; use signed dtypes. (The byte
    # layout is unchanged, but the dtype now matches the format; 8-bit WAV
    # stays unsigned.)
    typ = {1: np.uint8, 2: np.int16, 4: np.int32}.get(depth)
    sdata = wf.readframes(64000)  # only the first 64000 frames are decoded
    data = np.frombuffer(sdata, dtype=typ)
    ch_data = data[0::nch]  # keep channel 0 to obtain mono audio
    sdata = ch_data.tobytes()
    # Debug artifact: save the mono audio that is fed to the recognizer.
    outwav = wave.open('good.wave', 'w')
    outwav.setparams(wf.getparams())
    outwav.setnchannels(1)
    outwav.writeframes(ch_data.tobytes())
    outwav.close()

    if rec.AcceptWaveform(sdata):
        result = json.loads(rec.Result())
        text = result['text']
    else:
        result = json.loads(rec.PartialResult())
        text = result['partial']
    # FinalResult() flushes the recognizer; append any remaining text.
    text += json.loads(rec.FinalResult())['text']
    print('finish microphone')
    print('text:%s' % text)
    return text
Example #22
0
# Voice-command loop: recognize microphone speech and run the shell command
# mapped to the recognized phrase.
# NOTE(review): relies on `rec` (a KaldiRecognizer) and `dict` (a
# phrase -> command mapping that shadows the builtin) being defined earlier
# in the full script — confirm against the surrounding file.
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()

while True:
    data = stream.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        text = json.loads(rec.Result())['text']
        print(text)
        tmp = dict.get(text)
        try:
            if tmp:
                # Execute the shell command associated with the phrase.
                os.system(tmp)
        except BaseException:
            print("хуй тебе")

#os.system(dict[tmp])

    else:
        #print(rec.PartialResult().split('"partial" : "'))
        print(rec.PartialResult())

print(rec.FinalResult())
Example #23
0
        recognizedResults = voskSpeechRecognitionEngine.Result()
        print(recognizedResults)

        # Prepare recognized text to send
        parsedRecognizedResults = recognizedResults.split(
            '"text" : "')[1].split('"')[0]

        # Send results with yarp port
        outputBottle.clear()
        outputBottle.addString("Recognized: " + str(parsedRecognizedResults))
        voskSpeechRecognition_outputPort.write(outputBottle)

    # If detect and recognize parcial results
    else:
        # Print partial results
        recognizedPartialResults = voskSpeechRecognitionEngine.PartialResult()
        print(recognizedPartialResults)

# Close YARP ports
print("[INFO] Closing YARP ports ...")
voskSpeechRecognition_inputPort.close()
voskSpeechRecognition_outputPort.close()

print("")
print("")
print(
    "**************************************************************************"
)
print("Program finished")
print(
    "**************************************************************************"
Example #24
0
class Decoder:
    """Speech decoder that receives streamed audio from a front-end over
    TCP, assembles it into WAV files, and transcribes it with Vosk."""

    def __init__(self, info):
        # 8 kHz model matching the 8 kHz mono streams assembled below.
        model = Model(os.getcwd() + "/modules/model")
        self.rec = KaldiRecognizer(model, 8000)
        self.ip, self.port = info["front"]

    def decode_file(self, aud_file):
        """Transcribe *aud_file* and return its text.

        Returns the final-result text when present, otherwise the
        accumulated sentence when the average per-word confidence exceeds
        0.8, otherwise "".
        """
        SetLogLevel(0)
        sentence = ""
        results = ""
        confidence = 0
        tot = 0

        wf = wave.open(aud_file, "rb")
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
        ) != "NONE":  #checking certain file characteristics
            print("Audio aud_file must be WAV format mono PCM.")
            exit(1)

        while True:  #loop for doing voice recognition
            data = wf.readframes(4000)
            if len(data) == 0:  #done reading audio file
                break
            if self.rec.AcceptWaveform(
                    data):  #finished recognition on segment of audio file
                items = self.rec.Result()
                results = json.loads(items)
                if len(results.items(
                )) > 1:  #false recognition, sometimes nothing is detected
                    for i in results["result"]:
                        confidence += i["conf"]
                        tot += 1
                    sentence = sentence + " " + results["text"]
                else:
                    print(self.rec.PartialResult())
        f_res = json.loads(self.rec.FinalResult())
        # Fix: close the reader before any return; the original only closed
        # it after a possible early return, leaking the file handle.
        wf.close()
        if len(f_res.items()) > 1:
            return f_res["text"]
        if tot > 0 and confidence / tot > .8:  #checking confidence of recognition
            return sentence.lower().strip()
        elif tot > 0:
            print("confidence too low: " + str(confidence / tot))
        return ""

    def listen_stream(self):
        """Receive one utterance of streamed audio from the front-end and
        return its transcription.

        Performs the CNRDY/YEETO/FLUSH handshake, appends received chunks
        to a temp WAV until ~2 s of silence or the connection closes, then
        decodes the assembled file.
        """
        HOST = self.ip
        PORT = self.port
        CHUNK = 32768
        TIMEOUT = 10

        while True:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                totData = 0
                connDied = False

                ret = self.try_connection(HOST, PORT, s, "send CNRDY")
                if ret == False:
                    s.close()
                    continue
                print("connected")
                s.sendall(b"CNRDY\0")  #sending connection ready
                data = b""
                s.settimeout(2)
                while b"YEETO" not in data:  #getting rid of bad data
                    try:
                        data = s.recv(CHUNK)
                        print("bad data : {}".format(len(data)))
                        if len(data) == 0:
                            print("conn died during handshake")
                            time.sleep(2)
                            connDied = True
                            break

                    except:
                        print("timed out from connection and didn't get YEETO")
                        connDied = True
                        break
                if connDied:
                    continue
                s.settimeout(None)
                s.sendall(
                    b"FLUSH\0")  #letting front know bad data has been flushed
                FTOT, FTEMP = self.init_temp_tot_wave(
                )  #init FTOT and FTEMP files
                while True:
                    temp = self.open_temp_wave(FTEMP)  #get temporary wave file
                    try:
                        data = s.recv(CHUNK)
                    except:
                        print("connection with {} {} died".format(HOST, PORT))
                        connDied = True
                        break
                    size = len(data)
                    totData += size
                    if data == None or size == 0:  #check for when we
                        #receive packets of zero size
                        print("connection from front-end closed")
                        print(f"FRONT CLOSE tot data received : {totData}")
                        break
                    print(f"got data: {len(data)}")
                    temp.writeframesraw(data)
                    temp.close()
                    self.combine_files([FTOT, FTEMP])
                    #combining wave file data
                    if (self.detect_silence(FTOT)):
                        #2 seconds of silence detected
                        break
                if connDied:
                    break
            try:
                s.close()
                print(f"BACK CLOSE tot data received : {totData}")
                if totData != 0:  #we got zero data from the connection
                    self.send_gdata()
                    break
            except BrokenPipeError:
                print(f"connection died with {HOST} port {PORT}")

        results = self.decode_file(FTOT)  #get results from file
        print("FINAL RESULT from stream: " + results)
        return results

    def clear_socket(self):  #prototype for clearing socket data
        """Drain and discard all pending data on the front-end socket."""
        HOST = self.ip
        PORT = self.port
        TIMEOUT = 10  # fix: TIMEOUT was an undefined name here (NameError)

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            self.try_connection(HOST, PORT, sock, "CLEAR SOCKET")
            sock.settimeout(TIMEOUT)  # 10 second timeout
            size = 1
            while size > 0:
                # Fix: `size` was never updated, so the loop never ended.
                size = len(sock.recv(1024))  #receive data and throw it away
            sock.close()

    def send_cnerr(self):
        """Tell the front-end that a connection error occurred."""
        HOST = self.ip
        PORT = self.port

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending connection error")
            self.try_connection(HOST, PORT, sock, "SEND CNERR")
            sock.sendall(b"CNERR\0")
            sock.close()

    def send_gdata(self):
        """Tell the front-end that good data was received."""
        HOST = self.ip
        PORT = self.port

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending good data")
            self.try_connection(HOST, PORT, sock, "SEND GDATA")
            sock.sendall(b"GDATA\0")
            sock.close()

    def init_temp_tot_wave(self):
        """Create empty 8 kHz mono accumulator/chunk WAVs; return paths."""
        FTOT = "./temp/recv.wav"
        FTEMP = "./temp/temp.wav"

        tot = wave.open(FTOT, 'wb')
        tot.setnchannels(1)  #mono
        tot.setsampwidth(2)
        tot.setframerate(8000)
        tot.close()

        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  #mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        temp.close()
        return FTOT, FTEMP

    def open_temp_wave(self, FTEMP):
        """Open FTEMP for writing as an 8 kHz mono 16-bit WAV."""
        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  #mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        return temp

    def try_connection(self, HOST, PORT, s, funcName):
        """Attempt a single connect with a 5 s timeout; return success."""
        print("trying to connect " + HOST + " " + str(PORT))
        print(f"{funcName} connecting to front-end")
        time.sleep(2)
        s.settimeout(5)
        try:
            s.connect((HOST, PORT))
            s.settimeout(None)
            return True
        except ConnectionRefusedError:
            print("connection to {} on port {} refused.".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False
        except OSError:
            print("couldn't find {} on port {}".format(HOST, PORT))
            print("wil try again in 5 seconds")
            time.sleep(5)
            return False
        except TimeoutError:
            print("connection timed out for {} port {}".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False

    def send_mstop(self):
        """Tell the front-end to stop the microphone, retrying forever."""
        HOST = self.ip
        PORT = self.port

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending MSTOP")
            while True:
                try:
                    sock.connect((HOST, PORT))
                    break
                except ConnectionRefusedError:
                    print("connection to {} on port {} refused.".format(
                        HOST, PORT))
                    print("will try again in 5 seconds\n")
                    time.sleep(5)
                except OSError:
                    print("couldn't find {} on port {}".format(HOST, PORT))
                    print("wil try again in 5 seconds")
                    time.sleep(5)
            sock.sendall(b"MSTOP\0")
            sock.close()

    def combine_files(self, files):
        """Append the frames of files[1] onto files[0] (both 8 kHz mono)."""
        data = []

        for infile in files:
            w = wave.open(infile, "rb")
            data.append([w.readframes(w.getnframes())])
            w.close()

        output = wave.open(files[0], "wb")
        output.setnchannels(1)  #mono
        output.setsampwidth(2)
        output.setframerate(8000)
        output.writeframes(data[0][0])
        output.writeframes(data[1][0])
        output.close()

    def detect_silence(self, fileName):
        """Return True when *fileName* contains a silent span over 3 s."""
        myaudio = AudioSegment.from_wav(fileName)  # unused alias removed
        dBFS = myaudio.dBFS
        print(dBFS)
        pieces = silence.detect_silence(myaudio, 1000, dBFS - 0)
        pieces = [((start / 1000), (stop / 1000))
                  for start, stop in pieces]  #convert to sec

        for i in pieces:
            if i[1] - i[0] > 3:
                print("big silence: " + str(i[0]) + " " + str(i[1]))
                return True
        return False
Example #25
0
class Tester:
    """Runs vosk speech recognition over either a media file or, when no
    file path is supplied, the default microphone."""

    def __init__(
            self,
            filepath: Optional[str],
            model_path: str,
            sample_rate: int,
            use_gpu: bool = False
    ):
        if use_gpu:
            # Gpu part, uncomment if vosk-api has gpu support
            from vosk import GpuInit, GpuInstantiate
            GpuInit()
            GpuInstantiate()

        self.sample_rate = sample_rate
        self.model = Model(model_path)
        self.rec = KaldiRecognizer(self.model, sample_rate)
        self.filepath = filepath

    def _read(self, out):
        """Stream 8000-byte chunks from `out` into the recognizer and
        print each full or partial result as it becomes available."""
        while True:
            chunk = out.read(8000)
            if not chunk:
                break
            if self.rec.AcceptWaveform(chunk):
                print(self.rec.Result())
            else:
                print(self.rec.PartialResult())

        print(self.rec.FinalResult())

    def _test_microphone(self):
        """Capture mono 16-bit audio from the default mic and transcribe it."""
        mic = PyAudio().open(
            format=paInt16,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=8000
        )
        mic.start_stream()
        self._read(mic)

    def _test_file(self, filepath):
        """Decode `filepath` with ffmpeg to raw 16-bit mono PCM and transcribe."""
        command = [
            'ffmpeg', '-loglevel', 'quiet', '-i', filepath,
            '-ar', str(self.sample_rate), '-ac', '1', '-f', 's16le', '-',
        ]
        decoder = subprocess.Popen(command, stdout=subprocess.PIPE)
        self._read(decoder.stdout)

    def test(self):
        """Transcribe the configured file, or the microphone if none was set."""
        if self.filepath is None:
            self._test_microphone()
        else:
            self._test_file(self.filepath)
Example #26
0
        recognized_words = 0
        recognition_report.length = 0
        audio_report.write("\nTranscription:\n")

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                json_dict = json.loads(rec.Result())
                transcript = json_dict['text']
                count_word = recognition_report()
                recognized_words += count_word
            else:
                rec.PartialResult()

        total_recognized_words += recognized_words
        current_audio_time = time.time() - start_time
        total_time += current_audio_time

        audio_report.write("\n\nThe processing time of the audio file: ")
        audio_report.write("{}\n".format(time.strftime("%M:%S", time.gmtime(current_audio_time))))
        audio_report.write("Number of recognized words: {}".format(recognized_words))

        os.remove('audio.wav')

average_audio = total_time // number_audio

# Creating the final report
main_report = open(REPORT_PATH + '.txt', 'w')
Example #27
0
from vosk import Model, KaldiRecognizer
import sys
import json
import os

# Bail out early when the acoustic model directory is missing.
if not os.path.exists("model"):
    print(
        "Please download the model from https://github.com/alphacep/kaldi-android-demo/releases and unpack as 'model' in the current folder."
    )
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, 16000)

# The input is read as a raw byte stream; the 44-byte RIFF/WAV header is
# skipped so only PCM samples reach the recognizer.
wf = open(sys.argv[1], "rb")
wf.read(44)  # skip header

while True:
    chunk = wf.read(2000)
    if len(chunk) == 0:
        break
    if rec.AcceptWaveform(chunk):
        parsed = json.loads(rec.Result())
        print(parsed)
    else:
        parsed = json.loads(rec.PartialResult())
        print(parsed)

# Flush whatever audio remains buffered in the recognizer.
parsed = json.loads(rec.FinalResult())
print(parsed)
Example #28
0
    print("model at " + model_path + " found succesfully")

wf = wave.open(sys.argv[2], "rb")

# The recognizer expects raw mono 16-bit PCM; reject anything else up front.
is_mono_pcm = (wf.getnchannels() == 1 and wf.getsampwidth() == 2
               and wf.getcomptype() == "NONE")
if not is_mono_pcm:
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model(model_path)
# Restrict recognition to a small color vocabulary plus the unknown token.
rec = KaldiRecognizer(model, wf.getframerate(),
                      '["red", "green", "blue", "yellow", "white", "[unk]"]')

with open('full_result.json', 'w') as full_result_file, \
        open('partial_result.json', 'w') as partial_result_file:
    while True:
        frames = wf.readframes(4000)
        if len(frames) == 0:
            break
        if rec.AcceptWaveform(frames):
            result = rec.Result()
            full_result_file.write(result)
            check_color(json.loads(result))
        else:
            partial_result_file.write(rec.PartialResult())

    # Flush the recognizer's remaining buffered audio.
    result = rec.FinalResult()
    full_result_file.write(result)
    check_color(json.loads(result))
Example #29
0
def gen_subparts(input_file,
                 model_dir,
                 verbose=False,
                 partlen=4,
                 progress=False):
    """Yield SubPart subtitle segments transcribed from `input_file`.

    The media file is decoded with ffmpeg to 16 kHz mono PCM and streamed
    through a vosk recognizer; recognized words are grouped into parts of
    roughly `partlen` seconds each.

    Args:
        input_file: path to any media file ffmpeg can decode.
        model_dir: path to the vosk model directory.
        verbose: when True, keep vosk's default log level instead of quiet.
        partlen: target maximum length of one subtitle part, in seconds.
        progress: when True, show a tqdm progress bar over the duration.

    Yields:
        SubPart(start, end, text) segments in playback order; the final
        segment covers the trailing partial result up to the media's end.
    """
    SetLogLevel(0 if verbose else -1)

    model = Model(model_dir)
    rec = KaldiRecognizer(model, 16000)

    # ffmpeg streams raw 16-bit mono 16 kHz PCM on its stdout.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', input_file, '-ar',
        str(16000), '-ac', '1', '-f', 's16le', '-'
    ],
                               stdout=subprocess.PIPE)

    # ffprobe reports the total media duration (seconds) — used only for
    # the progress bar and the end time of the last yielded segment.
    r = subprocess.run(
        "ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1"
        .split() + [input_file],
        stdout=subprocess.PIPE)
    duration = float(r.stdout.decode('utf-8').strip())

    if progress:
        pbar = tqdm(total=duration, unit="s")

    # End time of the last yielded segment; anchors the trailing segment.
    prev_end = 0
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            r = json.loads(rec.Result())
            if 'result' in r:
                # Words accumulated for the segment being built.
                resultpart = []  # TODO: carry this across AcceptWaveform calls
                for result in r['result']:
                    # Adding this word would stretch the pending segment past
                    # `partlen`: flush the pending words first.
                    if len(resultpart) > 0 and float(result['end']) - float(
                            resultpart[0]['start']) >= partlen:
                        yield SubPart(start=resultpart[0]['start'],
                                      end=float(resultpart[-1]['end']),
                                      text=" ".join(r['word']
                                                    for r in resultpart))
                        prev_end = float(resultpart[-1]['end'])
                        resultpart = []
                    # A single word at least `partlen` long becomes its own
                    # segment (any pending words were flushed above).
                    if float(result['end'] - result['start']) >= partlen:
                        yield SubPart(start=float(result['start']),
                                      end=float(result['end']),
                                      text=result['word'])
                        prev_end = float(result['end'])
                        resultpart = []
                    else:
                        resultpart.append(result)
                    if progress:
                        pbar.update(float(result['end'] - pbar.n))

                # Flush whatever is left of this recognizer result.
                if len(resultpart) > 0:
                    yield SubPart(start=float(resultpart[0]['start']),
                                  end=float(resultpart[-1]['end']),
                                  text=" ".join(r['word'] for r in resultpart))
                    prev_end = float(resultpart[-1]['end'])
                    resultpart = []

        else:
            pass
            #print(rec.PartialResult())
    #pprint(rec.PartialResult())
    if progress:
        pbar.close()
    # NOTE(review): this reads PartialResult() after EOF; FinalResult()
    # would flush the recognizer's remaining buffered audio — confirm
    # whether the partial tail is intentional.
    r = json.loads(rec.PartialResult())
    text = r['partial']
    yield SubPart(start=prev_end, end=duration, text=text)
Example #30
0
class VoskInput(BaseInput):
    """ Uses the `vosk` package to do speech recognition. """

    def __init__(self):
        super(VoskInput, self).__init__()
        self.current_utterance = ""
        self.realtime = True  # indicates that audio can be streamed in

        # Model directory is configurable; fall back to the small US
        # English model when no config value is set.
        model_name = crystal.core.get_config(
            'vosk_model') or 'vosk-model-small-en-us-0.3'
        log.info(f"Using vosk model: {model_name}")
        self.model = Model(f"models/{model_name}")
        # The recognizer is created lazily in process_audio(), once the
        # stream's sample rate is known.
        self.rec = None
        self.__final_result = None

    def process_audio(self, raw_audio: bytes, sample_rate: int,
                      sample_width: int):
        """Feed one chunk of raw audio to the recognizer and return the
        text recognized so far for the current utterance (may be a
        partial transcription)."""
        if not self.rec:
            self.rec = KaldiRecognizer(self.model, sample_rate)
        full = self.rec.AcceptWaveform(raw_audio)
        if full:
            result = self.rec.Result()
        else:
            result = self.rec.PartialResult()
        log.debug(result)
        result = json.loads(result)
        if "result" in result:
            # Remember the last word-level result for get_full_result().
            self.__final_result = result
        # Default to "" so `text` is always bound even when the result
        # carries neither "text" nor "partial" (fixes a potential
        # UnboundLocalError in the original).
        text = ""
        if "text" in result:
            text = result["text"]
        elif "partial" in result:
            text = result["partial"]
        if text:
            self.current_utterance = text
        return self.current_utterance

    def get_full_result(self):
        """Finalize the current utterance and return its auto-corrected
        full text, resetting recognizer state for the next utterance."""
        if self.__final_result:
            result = self.__final_result
        else:
            result = self.rec.FinalResult()
            result = json.loads(result)
        log.debug(result)
        # Reset so the next utterance starts with a fresh recognizer.
        self.rec = None
        self.current_utterance = ""
        self.__final_result = None

        full_text = result["text"]
        # HACK: auto correct text to match domain vocabulary. Sorry.
        full_text = full_text.replace("palace music", "pause music")
        full_text = full_text.replace("applause music", "pause music")
        if any(x in full_text
               for x in ["turn on", "turn off", "turned on", "turned off"]):
            full_text = full_text.replace("the land", "the lamp").replace(
                "the lamb", "the lamp")
            if full_text.endswith("the lam"):
                full_text = full_text.replace("the lam", "the lamp")
        if any(x in full_text for x in ["timer", "alarm"]):
            full_text = full_text.replace("crystal said", "crystal set")
        if full_text.endswith("to pm"):
            full_text = full_text.replace("to pm", "2 pm")
        elif full_text.endswith(" a m"):
            full_text = full_text.replace(" a m", " am")
        if full_text.startswith("christo"):
            full_text = full_text.replace("christo", "crystal")
        elif full_text.startswith("crews to"):
            full_text = full_text.replace("crews to", "crystal")
        elif full_text.startswith("christian"):
            full_text = full_text.replace("christian", "crystal")

        return full_text