class Recognizer:
    def __init__(self, model: Model):
        self._rec = KaldiRecognizer(model, 16000)
        self._type_mapping = {"wav": "wav", "mpeg": "mp3"}

    def __del__(self):
        self._rec = None

    def recognize(self, contents: bytes) -> dict:
        self._rec.AcceptWaveform(contents)
        return json.loads(self._rec.Result())

    def format_normalize(self, file: File, type: str = "wav") -> bytes:
        # Re-encode the upload as 16 kHz audio so it matches the recognizer's rate.
        audio = AudioSegment.from_file(file, self._type_mapping[type])
        audio = audio.set_frame_rate(16000)
        buf = io.BytesIO()
        audio.export(buf, format="wav")
        return buf.getvalue()
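# A minimal, hypothetical usage sketch for the Recognizer class above; the
# model path and input filename are assumptions, not part of the original code.
import io
import json

from pydub import AudioSegment
from vosk import KaldiRecognizer, Model

if __name__ == "__main__":
    model = Model("models/vosk-model-small-en-us-0.15")  # assumed model path
    recognizer = Recognizer(model)
    # AudioSegment.from_file() also accepts an open binary file object, so a
    # plain file handle stands in for the framework's uploaded File here.
    with open("sample.wav", "rb") as f:
        wav_bytes = recognizer.format_normalize(f, type="wav")
    print(recognizer.recognize(wav_bytes).get("text", ""))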
def use_offline_recognition():
    recognized_data = ""
    if not os.path.exists("models/vosk-model-small-ru-0.4"):
        print("Please download the model from:\n"
              "https://alphacephei.com/vosk/models and unpack it as "
              "'models/vosk-model-small-ru-0.4' in the current folder.")
        exit(1)
    wave_audio_file = wave.open("microphone-results.wav", "rb")
    model = Model("models/vosk-model-small-ru-0.4")
    offline_recognizer = KaldiRecognizer(model, wave_audio_file.getframerate())
    data = wave_audio_file.readframes(wave_audio_file.getnframes())
    if len(data) > 0:
        if offline_recognizer.AcceptWaveform(data):
            recognized_data = offline_recognizer.Result()
            recognized_data = json.loads(recognized_data)
            recognized_data = recognized_data["text"]
            print(recognized_data)
    return recognized_data
def stt_ru_offline():
    model = Model("model")
    rec = KaldiRecognizer(model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    while True:
        data = stream.read(4000, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            x = json.loads(rec.Result())
            if len(x["text"]):
                return x["text"]
        else:
            # print(rec.PartialResult())
            pass
def recognize(model, wav_file_path):
    """Speech-to-text recognizer for Russian speech using Vosk models.

    The path to the Russian Vosk model should be configured in config.py.
    """
    with wave.open(wav_file_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise TypeError("Audio file must be WAV format mono PCM.")
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        json_ = json.loads(rec.FinalResult())
        return json_['text']
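# Hypothetical call of recognize() above; the model directory and the WAV
# filename are assumptions for illustration.
from vosk import Model

if __name__ == "__main__":
    model = Model("models/vosk-model-small-ru-0.4")  # assumed model path
    print(recognize(model, "recording.wav"))         # mono 16-bit PCM WAV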
def transcribe(audio_path, output_path, model_id="vosk-model-en-in-0.4"):
    """
    :param audio_path: path to the input audio file (any format ffmpeg can read)
    :param output_path: path of the markdown file to write the transcript to
    :param model_id: model directory name (get models from
        https://alphacephei.com/vosk/models and extract into the vosk_models folder)
    """
    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "vosk_models", model_id)
    sample_rate = 16000
    model = Model(model_path)
    rec = KaldiRecognizer(model, sample_rate)
    # Decode to 16 kHz mono signed 16-bit PCM on stdout and stream it into Vosk.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', audio_path,
        '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    text = ""
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = "%s %s" % (text, result["text"])
        else:
            # print(rec.PartialResult())
            pass
    print(rec.FinalResult())
    MdFile(file_path=output_path).dump_to_file(metadata={}, content=text, dry_run=False)
def driver():
    # Enqueues recognized utterances.
    if not os.path.exists("model"):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack as 'model' in the current folder.")
        exit(1)
    model = Model("model")
    rec = KaldiRecognizer(model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    while True:
        data = stream.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            inputQueue.put(rec.Result())
def transcribe_wav(filepath):
    wf = wave.open(filepath, 'rb')
    n_channels = wf.getnchannels()
    # If not mono, convert it to mono first.
    if n_channels != 1:
        mono_filename = f'{filepath}.monofile.wav'
        mono = wave.open(mono_filename, 'wb')
        mono.setparams(wf.getparams())
        mono.setnchannels(1)
        mono.writeframes(
            audioop.tomono(wf.readframes(wf.getnframes()), wf.getsampwidth(), 1, 1))
        mono.close()
        wf = wave.open(mono_filename, 'rb')
        os.remove(mono_filename)  # the open handle keeps the file readable on POSIX
    rec = KaldiRecognizer(model, wf.getframerate())
    transcription = ''
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            transcription += json.loads(rec.Result())['text'] + ' '
    transcription += json.loads(rec.FinalResult())['text']
    os.remove(filepath)
    return transcription
def run_asr(f):
    try:
        wf = f + str(uuid.uuid4()) + ".wav"
        if MAX_WAV_LEN:
            os.system(
                f"ffmpeg -hide_banner -loglevel panic -n -i {shlex.quote(f)} "
                f"-ss 0 -t {MAX_WAV_LEN} -ar {SAMPLE_RATE} -ac 1 {shlex.quote(wf)}"
            )
        else:
            os.system(
                f"ffmpeg -hide_banner -loglevel panic -n -i {shlex.quote(f)} "
                f"-ar {SAMPLE_RATE} -ac 1 {shlex.quote(wf)}"
            )
        o_wf = wave.open(wf, "rb")
        data = o_wf.readframes(o_wf.getnframes())
        o_wf.close()
        os.remove(wf)
        rec = KaldiRecognizer(model, SAMPLE_RATE)
        rec.AcceptWaveform(data)
        return json.loads(rec.FinalResult())
    except Exception as ex:
        return {"error": str(ex)}
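# Hypothetical setup for run_asr() above; the module-level names it relies on
# (model, SAMPLE_RATE, MAX_WAV_LEN) are given assumed values here.
import json
import os
import shlex
import uuid
import wave

from vosk import KaldiRecognizer, Model

SAMPLE_RATE = 16000
MAX_WAV_LEN = 60  # seconds; a falsy value transcribes the whole file
model = Model("models/vosk-model-small-en-us-0.15")  # assumed model path

if __name__ == "__main__":
    print(run_asr("input.mp3").get("text", ""))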
def transcribe():
    try:
        worker.log.info('[%s] New user entry on /transcribe'
                        % (strftime("%d/%b/%Y %H:%M:%S", gmtime())))
        is_metadata = False
        # Get the response content type.
        if request.headers.get('accept').lower() == 'application/json':
            is_metadata = True
        elif request.headers.get('accept').lower() == 'text/plain':
            is_metadata = False
        else:
            raise ValueError('Not accepted header')
        # Get the input file.
        if 'file' in request.files.keys():
            file = request.files['file']
            worker.getAudio(file)
            rec = KaldiRecognizer(model, spkModel, worker.rate, worker.ONLINE)
            rec.AcceptWaveform(worker.data)
            data_ = rec.FinalResult()
            confidence = rec.uttConfidence()
            if is_metadata:
                data_ = rec.GetMetadata()
            data = worker.get_response(data_, confidence, is_metadata)
            worker.clean()
        else:
            raise ValueError('No audio file was uploaded')
        return data, 200
    except ValueError as error:
        return str(error), 400
    except Exception as e:
        worker.log.error(e)
        return 'Server Error', 500
def myCommand():
    """Listens for commands (vosk was imported at the top of the file)."""
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    model = Model("model-en")
    rec = KaldiRecognizer(model, 16000)
    while True:
        data = stream.read(2000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # Parse the JSON result instead of splitting the raw string on "text".
            command = json.loads(rec.Result())["text"]
            stream.stop_stream()
            stream.close()
            p.terminate()
            return command
# END of the speech-to-text function that returns the variable: command
def transcribe(file_name):
    sound = AudioSegment.from_wav(file_name)
    sound = sound.set_channels(1)
    sound.export("generate/audio.wav", format="wav")
    SetLogLevel(-1)
    if not os.path.exists("model"):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack as 'model' in the current folder.")
        exit(1)
    wf = wave.open('generate/audio.wav', "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("model")
    rec = KaldiRecognizer(model, wf.getframerate())
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            pass  # print(rec.Result())
        else:
            pass  # print(rec.PartialResult())
    r = json.loads(rec.FinalResult())
    return r['text']
def my_link():
    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    while c in os.listdir(audio_path):
        # for i in os.listdir(audio_path):
        #     if i.endswith('.wav'):
        #         sound = AudioSegment.from_wav('C:/Users/admin/Downloads/' + i)
        #         sound = sound.set_channels(1)        # make it mono
        #         sound = sound.set_frame_rate(44100)  # sample rate 44,100 Hz
        #         sound.export('C:/Users/admin/Downloads/' + i, format="wav")
        wf = wave.open(audio_path + '/' + c, 'rb')
        model = Model("vosk-model-small-en-in-0.4")
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        result = json.loads(rec.FinalResult())  # parse the JSON result string
        print(c)
        print(result["text"])
        l.append(result["text"])
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
    # for i in l:
    return render_template('index.html')
def _recognize_vosk(self):
    SetLogLevel(0)
    if not os.path.exists("vosk-model-small-es-0.3"):
        raise Exception("Please download the model from "
                        "https://alphacephei.com/vosk/models and unpack it as "
                        "'vosk-model-small-es-0.3' in the current folder.")
    text = []
    wf = wave.open(self.file_name, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        raise Exception("Audio file must be WAV format mono PCM.")
    model = Model("vosk-model-small-es-0.3")
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)
    try:
        res = json.loads(rec.FinalResult())
        return res['text']
    except Exception as e:
        print(e)
        return ""
def upload_voicecar(request):
    if request.method == "POST":
        myFile = request.FILES.get("myfile", None)
        if not myFile:
            print("no files for upload!")
            return HttpResponse("no files for upload!")
        destination = open(os.path.join("media/voice", myFile.name), 'wb+')
        for chunk in myFile.chunks():
            destination.write(chunk)
        destination.close()
        rec = KaldiRecognizer(vosk_model, 16000)
        wf = wave.open(BASE_DIR + '/media/voice/voicecar.wav', "rb")
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                rec.Result()  # intermediate segment results are discarded
        data = json.loads(rec.FinalResult())
        voicetext = data['text']
        print(voicetext)
        # Command word lists from the config; each is a comma-separated list:
        # qian = forward, yizhiqian = keep going forward, hou = backward,
        # yizhihou = keep going backward, zuo = left, you = right, ting = stop.
        qian = dragon_cf['voicerec']['qian'].split(',')
        yizhiqian = dragon_cf['voicerec']['yizhiqian'].split(',')
        hou = dragon_cf['voicerec']['hou'].split(',')
        yizhihou = dragon_cf['voicerec']['yizhihou'].split(',')
        zuo = dragon_cf['voicerec']['zuo'].split(',')
        you = dragon_cf['voicerec']['you'].split(',')
        ting = dragon_cf['voicerec']['ting'].split(',')
        if voicetext in qian:
            print("qianqianqian")
            robot.forward(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in yizhiqian:
            print("yizhiqian")
            robot.forward(0.5)
        elif voicetext in hou:
            print("hou")
            robot.backward(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in yizhihou:
            print("yizhihou")
            robot.backward(0.5)
        elif voicetext in zuo:
            print("zuo")
            robot.left(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in you:
            print("you")
            robot.right(0.5)
            time.sleep(0.1)
            robot.stop()
        elif voicetext in ting:
            print("tingtingting")
            robot.stop()
    return HttpResponse("upload over!")
class Decoder:
    def __init__(self, info):
        model = Model(os.getcwd() + "/modules/model")
        self.rec = KaldiRecognizer(model, 8000)
        self.ip, self.port = info["front"]

    def decode_file(self, aud_file):
        SetLogLevel(0)
        sentence = ""
        results = ""
        confidence = 0
        tot = 0
        wf = wave.open(aud_file, "rb")
        # Check certain file characteristics.
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio aud_file must be WAV format mono PCM.")
            exit(1)
        while True:  # loop for doing voice recognition
            data = wf.readframes(4000)
            if len(data) == 0:  # done reading the audio file
                break
            if self.rec.AcceptWaveform(data):  # finished a segment of the audio file
                items = self.rec.Result()
                results = json.loads(items)
                # On a false recognition, sometimes nothing is detected.
                if len(results.items()) > 1:
                    for i in results["result"]:
                        confidence += i["conf"]
                        tot += 1
                    sentence = sentence + " " + results["text"]
            else:
                print(self.rec.PartialResult())
        f_res = json.loads(self.rec.FinalResult())
        if len(f_res.items()) > 1:
            return f_res["text"]
        wf.close()
        if tot > 0 and confidence / tot > .8:  # check recognition confidence
            return sentence.lower().strip()
        elif tot > 0:
            print("confidence too low: " + str(confidence / tot))
        return ""

    def listen_stream(self):
        HOST = self.ip
        PORT = self.port
        CHUNK = 32768
        TIMEOUT = 10
        while True:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                totData = 0
                connDied = False
                ret = self.try_connection(HOST, PORT, s, "send CNRDY")
                if ret == False:
                    s.close()
                    continue
                print("connected")
                s.sendall(b"CNRDY\0")  # sending connection ready
                data = b""
                s.settimeout(2)
                while b"YEETO" not in data:  # getting rid of bad data
                    try:
                        data = s.recv(CHUNK)
                        print("bad data : {}".format(len(data)))
                        if len(data) == 0:
                            print("conn died during handshake")
                            time.sleep(2)
                            connDied = True
                            break
                    except:
                        print("timed out from connection and didn't get YEETO")
                        connDied = True
                        break
                if connDied:
                    continue
                s.settimeout(None)
                s.sendall(b"FLUSH\0")  # let the front end know bad data has been flushed
                FTOT, FTEMP = self.init_temp_tot_wave()  # init FTOT and FTEMP files
                while True:
                    temp = self.open_temp_wave(FTEMP)  # get a temporary wave file
                    try:
                        data = s.recv(CHUNK)
                    except:
                        print("connection with {} {} died".format(HOST, PORT))
                        connDied = True
                        break
                    size = len(data)
                    totData += size
                    if data == None or size == 0:
                        # Check for when we receive packets of zero size.
                        print("connection from front-end closed")
                        print(f"FRONT CLOSE tot data received : {totData}")
                        break
                    print(f"got data: {len(data)}")
                    temp.writeframesraw(data)
                    temp.close()
                    self.combine_files([FTOT, FTEMP])  # combine wave file data
                    if self.detect_silence(FTOT):  # 2 seconds of silence detected
                        break
                if connDied:
                    break
                try:
                    s.close()
                    print(f"BACK CLOSE tot data received : {totData}")
                    if totData != 0:  # only report good data if we got any
                        self.send_gdata()
                        break
                except BrokenPipeError:
                    print(f"connection died with {HOST} port {PORT}")
        results = self.decode_file(FTOT)  # get results from the file
        print("FINAL RESULT from stream: " + results)
        return results

    def clear_socket(self):
        # Prototype for clearing socket data.
        HOST = self.ip
        PORT = self.port
        TIMEOUT = 10  # 10 second timeout
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            self.try_connection(HOST, PORT, sock, "CLEAR SOCKET")
            sock.settimeout(TIMEOUT)
            size = 1
            while size > 0:
                size = len(sock.recv(1024))  # receive data and throw it away
            sock.close()

    def send_cnerr(self):
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending connection error")
            self.try_connection(HOST, PORT, sock, "SEND CNERR")
            sock.sendall(b"CNERR\0")
            sock.close()

    def send_gdata(self):
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending good data")
            self.try_connection(HOST, PORT, sock, "SEND GDATA")
            sock.sendall(b"GDATA\0")
            sock.close()

    def init_temp_tot_wave(self):
        FTOT = "./temp/recv.wav"
        FTEMP = "./temp/temp.wav"
        tot = wave.open(FTOT, 'wb')
        tot.setnchannels(1)  # mono
        tot.setsampwidth(2)
        tot.setframerate(8000)
        tot.close()
        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  # mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        temp.close()
        return FTOT, FTEMP

    def open_temp_wave(self, FTEMP):
        temp = wave.open(FTEMP, 'wb')
        temp.setnchannels(1)  # mono
        temp.setsampwidth(2)
        temp.setframerate(8000)
        return temp

    def try_connection(self, HOST, PORT, s, funcName):
        print("trying to connect " + HOST + " " + str(PORT))
        print(f"{funcName} connecting to front-end")
        time.sleep(2)
        s.settimeout(5)
        try:
            s.connect((HOST, PORT))
            s.settimeout(None)
            return True
        except ConnectionRefusedError:
            print("connection to {} on port {} refused.".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False
        except OSError:
            print("couldn't find {} on port {}".format(HOST, PORT))
            print("will try again in 5 seconds")
            time.sleep(5)
            return False
        except TimeoutError:
            print("connection timed out for {} port {}".format(HOST, PORT))
            print("will try again in 5 seconds\n")
            time.sleep(5)
            return False

    def send_mstop(self):
        HOST = self.ip
        PORT = self.port
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            print("sending MSTOP")
            while True:
                try:
                    sock.connect((HOST, PORT))
                    break
                except ConnectionRefusedError:
                    print("connection to {} on port {} refused.".format(HOST, PORT))
                    print("will try again in 5 seconds\n")
                    time.sleep(5)
                except OSError:
                    print("couldn't find {} on port {}".format(HOST, PORT))
                    print("will try again in 5 seconds")
                    time.sleep(5)
            sock.sendall(b"MSTOP\0")
            sock.close()

    def combine_files(self, files):
        data = []
        for infile in files:
            w = wave.open(infile, "rb")
            data.append([w.readframes(w.getnframes())])
            w.close()
        output = wave.open(files[0], "wb")
        output.setnchannels(1)  # mono
        output.setsampwidth(2)
        output.setframerate(8000)
        output.writeframes(data[0][0])
        output.writeframes(data[1][0])
        output.close()

    def detect_silence(self, fileName):
        myaudio = AudioSegment.from_wav(fileName)
        dBFS = myaudio.dBFS
        print(dBFS)
        pieces = silence.detect_silence(myaudio, 1000, dBFS - 0)
        pieces = [((start / 1000), (stop / 1000))
                  for start, stop in pieces]  # convert to seconds
        for i in pieces:
            if i[1] - i[0] > 3:
                print("big silence: " + str(i[0]) + " " + str(i[1]))
                return True
        return False
def detectKeywords(libpath):
    audio_stream = AudiostreamSource()
    extractor = FeatureExtractor(libpath)
    detector = AudioRecognition(libpath)
    framerate = 16000
    model = Model("model")
    # Define a custom dictionary (restricted grammar) for the recognizer.
    rec = KaldiRecognizer(
        model, framerate,
        '["oh one two three four five six seven eight nine zero", "[unk]"]')
    extractor_gain = 1.0
    # Add one or more keyword models.
    keywordIdAlexa = detector.addModel(
        '../../models/Hotword/alexa_v3.0.35.premium', 0.85)
    bufsize = detector.getInputDataSize()
    print("Audio Recognition Version: " + detector.getVersionString())
    command_started = False
    audio_stream.start()
    try:
        while True:
            # Wakeword loop
            if not command_started:
                frame = audio_stream.read(bufsize * 2, bufsize * 2)
                if not frame:
                    time.sleep(0.01)
                    continue
                features = extractor.signalToMel(frame, extractor_gain)
                prediction = detector.runDetection(features)
                if prediction != 0:
                    now = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
                    if prediction == keywordIdAlexa:
                        print("Alexa detected:" + now)
                        os.system(play_command + " ../resources/ding.wav")
                        command_started = True
            # Vosk loop
            else:
                frame = audio_stream.read(4000, 4000)
                if not frame:
                    time.sleep(0.01)
                    continue
                if rec.AcceptWaveform(bytes(frame)):
                    print(rec.Result())
                    command_started = False
        print(rec.FinalResult())
    except KeyboardInterrupt:
        print("Terminating")
        audio_stream.stop()
        sys.exit(0)
    ofp.write(json.dumps(transcripts, indent=4))

with open(outCTMFile, 'w') as ofp:
    for transcript in transcripts:
        # print('\t%s (%s-%s-%s)\n' % (transcript['transcription'], sessionId,
        #       transcript['utterance_start'], transcript['utterance_duration']))
        for token in transcript["tokens"]:
            ofp.write("%s \t 1 \t %.2f \t %.2f \t %s\n" %
                      (sessionId, token["start"], token["duration"], token["baseform"]))
print(' ')

# Get the list of JSON dictionaries.
results = []
if useSegmentsInVosk:
    for segment in tqdm(segments):
        if len(segment.bytes) == 0:
            continue
        if rec.AcceptWaveform(segment.bytes):
            part_result = json.loads(rec.Result())
            results.append(part_result)
    part_result = json.loads(rec.FinalResult())
    if part_result:
        results.append(part_result)
else:
    # Recognize speech using the Vosk model in streaming mode.
    wf = wave.open(audioFile, "rb")
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            part_result = json.loads(rec.Result())
            results.append(part_result)
class AnkiSpeechToCommand():
    """Manages speech-to-text for Anki-related commands."""

    def __init__(self, command_config="commands.json", alert_sound_enabled=True):
        """Constructor for AnkiSpeechToCommand.

        Initialises the vosk speech-to-text module, the AnkiConnect API handler
        object, and derives word commands from a JSON file.

        Args:
            command_config (str, optional): Filename for the JSON command file.
                Defaults to "commands.json".
            alert_sound_enabled (bool, optional): Controls the confirmation sound
                for attach, pause, and unpause commands. Defaults to True.

        Raises:
            json.decoder.JSONDecodeError: Handles decode errors from the JSON
                command file, such as malformed syntax.
            AnkiVoiceError: Handles anki-voice errors, in particular missing
                command definitions.
        """
        # Verify that the speech-to-text engine (vosk) model exists.
        if not Path(Path(__file__).resolve().parent, "Model").is_dir():
            print("Please download the model from "
                  "https://github.com/alphacep/vosk-api/blob/master/doc/models.md "
                  "and unpack as 'model' (directory) in the current folder.")
            sys.exit(1)
        # Configure the speech-to-text engine.
        SetLogLevel(-10)
        self._model = Model("model")
        self._recogniser = KaldiRecognizer(self._model, 16000)
        self._stream = pyaudio.PyAudio().open(format=pyaudio.paInt16,
                                              channels=1,
                                              rate=16000,
                                              input=True,
                                              frames_per_buffer=2048)
        self._stream.start_stream()
        # Create the AnkiConnect API handler object.
        self._anki_action = AnkiActionHandler(
            alert_sound_enabled=alert_sound_enabled)
        # Behaviour configuration
        self._speech_to_text_paused = False
        self._alert_sound_enabled = alert_sound_enabled
        # Parse the command JSON configuration.
        self.command_config_load(command_config)
        # tba
        self.engine = pyttsx3.init()

    def command_config_load(self, command_config):
        try:
            with open(command_config) as command_config_raw:
                command_config_json = json.load(command_config_raw)
            for command in ["attach", "show", "again", "difficult", "good",
                            "easy", "pause", "unpause", "close", "quit"]:
                if command not in command_config_json:
                    raise Exception(
                        f"Malformed commands in {command_config}. "
                        f"Missing the command (key): {command}")
                # e.g. self._attach_commands = ["attach"] + its related words.
                setattr(self, f"_{command}_commands",
                        [command] + command_config_json[command]["related_words"])
        except json.decoder.JSONDecodeError as ex:
            logging.error(
                f"A JSON decoder error occurred when attempting to obtain Anki command words: {ex}")
            sys.exit(1)
        except AnkiVoiceError as ex:
            logging.error(f"An anki-voice error occurred: {ex}")
            sys.exit(1)
        except Exception as ex:
            logging.error(
                f"An unknown exception occurred when attempting to obtain Anki command words: {ex}")
            sys.exit(1)

    def run(self):
        """Starts a thread to handle speech-to-text module functionality."""
        self._command_detection = threading.Thread(
            target=self._cyclic_word_detection)
        self._command_detection.start()

    def pause(self):
        """Pauses speech-to-text monitoring (except for 'unpause' commands)."""
        self._speech_to_text_paused = True
        print("Executed: pause")
        if self._alert_sound_enabled:
            audio_feedback_queue.put_nowait("Success: Paused.")

    def unpause(self):
        """Unpauses speech-to-text monitoring (permitting any commands)."""
        self._speech_to_text_paused = False
        print("Executed: unpause")
        if self._alert_sound_enabled:
            audio_feedback_queue.put_nowait("Success: Unpaused.")

    def quit(self):
        """Triggers exit of anki-voice."""
        print("Executed: quit")
        sys.exit(0)

    def _cyclic_word_detection(self):
        """Loops through audio input and identifies speech to text for possible commands."""
        while True:
            data = self._stream.read(2048, exception_on_overflow=False)
            if len(data) == 0:
                break
            if self._recogniser.AcceptWaveform(data):
                res = json.loads(self._recogniser.Result())
                # Identify sentence blocks.
                if "text" in res:
                    detected_words = res["text"].lower()
                    if detected_words != "":
                        self._action_command(detected_words)

    def _action_command(self, detected_words):
        """Analyses speech-to-text strings for anki-voice commands.

        Args:
            detected_words (str): The words identified through speech-to-text
                analysis.
        """
        # If paused, only proceed when the command is to unpause.
        if self._speech_to_text_paused:
            if detected_words not in self._unpause_commands:
                return
        # Process commands.
        print("Detected:", detected_words)
        if detected_words in self._attach_commands:
            self._anki_action.get_current_card_information(
                called_through_attach_command=True)
        elif detected_words in self._show_commands:
            self._anki_action.show()
        elif detected_words in self._again_commands:
            self._anki_action.again()
        elif detected_words in self._difficult_commands:
            self._anki_action.difficult()
        elif detected_words in self._good_commands:
            self._anki_action.good()
        elif detected_words in self._easy_commands:
            self._anki_action.easy()
        elif detected_words in self._pause_commands:
            self.pause()
        elif detected_words in self._unpause_commands:
            self.unpause()
        elif detected_words in self._close_commands:
            self._anki_action.close()
        elif detected_words in self._quit_commands:
            self.quit()

    def __del__(self):
        """Destructor for AnkiSpeechToCommand.

        Stops the pyaudio stream used by the vosk speech-to-text module.

        Raises:
            AttributeError: Handles the situation where "Model" folder validation
                fails in the constructor. Not required to be logged.
        """
        try:
            self._stream.stop_stream()
        except AttributeError:
            pass
        except Exception as ex:
            logging.error(
                f"An unknown exception occurred when attempting to stop the pyaudio stream for the vosk module: {ex}")
def main():
    configuration = Configuration("config/config.yaml")
    if not os.path.exists("model/" + configuration.config_list["language"]):
        print("Please download the model from "
              "https://github.com/alphacep/vosk-api/blob/master/doc/models.md "
              "and unpack as 'model' in the current folder.")
        exit(1)
    configuration.generate_nlu_file()
    # Hotword
    hotword = Hotword(configuration.config_list["hotword"])
    # Text to speech
    tts = Tts()
    tts.setVoice(configuration.config_list["voice_id"])
    # PyAudio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    # Vosk
    model = Model("model/" + configuration.config_list["language"])
    rec = KaldiRecognizer(model, 16000)
    # Snips NLU
    nlu = Nlu("nlu/" + configuration.config_list["language"] + "/dataset.json")
    # Load plugins
    plugin_directories = [os.path.normpath('plugins')]
    plugins_list = PluginList(plugin_directories)
    plugins_list.find_plugins()
    while True:
        data = stream.read(8000, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            rec_result = json.loads(rec.Result())
            if rec_result["text"].count(hotword.getWord()) > 0:
                tts.speak(configuration.config_list["sentence_welcome"])
                hotword.setState(True)
            if hotword.getState() == True:
                if rec_result["text"] != "":
                    parsing = nlu.parse(rec_result["text"])
                    if parsing["intent"]["probability"] >= configuration.config_list["min_probability"]:
                        for plugin in plugins_list._plugins:
                            plugin_object = plugins_list._plugins[plugin].plugin_class
                            if plugin_object.has_intent(parsing["intent"]["intentName"]) == True:
                                response = plugin_object.get_response(
                                    parsing["intent"]["intentName"], parsing["slots"])
                                tts.speak(response)
                                hotword.setState(False)
                    elif parsing["intent"]["intentName"] == None:
                        hotword.setState(True)
                    else:
                        # "I'm not sure I understood, can you repeat?"
                        tts.speak("je ne suis pas sur d'avoir compris, peux-tu répéter?")
class VoskInput(AudioInput):
    """
    Input from Vosk using the given language model.
    """

    def __init__(self,
                 notifier,
                 rate=16000,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'model')):
        """
        @see AudioInput.__init__()

        :type  rate: int
        :param rate: The override for the rate, if not the model's one.
        :type  wav_dir: str
        :param wav_dir: Where to save the wave files, if anywhere.
        :type  model: str
        :param model: The path to the Vosk model file.
        """
        # Load in and configure the model.
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model,))
        LOG.info("Loading model from %s, this could take a while", model)
        SetLogLevel(1 if LOG.getLogger().getEffectiveLevel() >= 20 else 2)
        self._model = Model(model)
        self._recognizer = KaldiRecognizer(self._model, rate)
        LOG.info("Model loaded")

        # We can now init the superclass.
        super(VoskInput, self).__init__(notifier,
                                        format=pyaudio.paInt16,
                                        channels=1,
                                        rate=rate,
                                        wav_dir=wav_dir)

        # Where we put the results
        self._results = []

    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        # Attempt to decode it.
        if self._recognizer.AcceptWaveform(data):
            self._add_result(self._recognizer.Result())

    def _decode(self):
        """
        @see AudioInput._decode()
        """
        # Collect anything remaining.
        self._add_result(self._recognizer.FinalResult())

        # Ensure it's clear for next time.
        self._recognizer.Reset()

        # Tokenize
        tokens = []
        LOG.debug("Decoding: %s" % self._results)
        for result in self._results:
            word = result.get('word', '').strip()
            conf = result.get('conf', 0.0)
            if word and conf:
                tokens.append(Token(word, conf, True))

        # Done
        self._results = []

        # And give them all back.
        LOG.debug("Got: %s" % ' '.join(str(i) for i in tokens))
        return tokens

    def _add_result(self, json_result):
        """
        Add in any result we have from the given JSON string.
        """
        result = json.loads(json_result)
        LOG.debug("Got %s" % json_result)

        # See what we got, if anything.
        if 'result' in result:
            # A full result, which is the best.
            self._results.extend(result['result'])
        elif 'text' in result:
            # A decoded text string.
            for word in result['text'].split():
                if word:
                    self._results.append({'word': word, 'conf': 1.0})
def transcribe_to_sql(self, duration, side, original_file_name, rec_date,
                      src, dst, linkedid):
    trans_start = time.time()
    if self.source_id == self.sources['master']:
        original_file_name = linkedid + ('-in.wav' if side == 0 else '-out.wav')
    transcribation_date = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    print('transcribing', self.temp_file_path + self.temp_file_name)
    # Read the file.
    wf = wave.open(self.temp_file_path + self.temp_file_name, "rb")
    # Load the model.
    model = Model(self.model_path)
    rec = KaldiRecognizer(model, wf.getframerate())
    # Recognize phrase by phrase.
    phrases_count = 0
    confidences = []
    while True:
        conf_score = []
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            accept = json.loads(rec.Result())
            if accept['text'] != '':
                accept_start = str(accept['result'][0]['start'])
                accept_end = accept['result'][-1:][0]['end']
                accept_text = str(accept['text'])
                for result_rec in accept['result']:
                    conf_score.append(float(result_rec['conf']))
                conf_mid = str(sum(conf_score) / len(conf_score))
                confidences.append(sum(conf_score) / len(conf_score))
                self.save_result(duration, accept_text, accept_start,
                                 accept_end, side, transcribation_date,
                                 conf_mid, original_file_name, rec_date,
                                 src, dst, linkedid)
                phrases_count += 1
    if len(confidences):
        self.confidence_of_file = sum(confidences) / len(confidences)
    else:
        self.confidence_of_file = 0
    trans_end = time.time()
    self.perf_log(2, trans_start, trans_end, duration, linkedid)
    if phrases_count == 0:
        self.save_result(duration, '', '0', '0', side, transcribation_date,
                         0, original_file_name, rec_date, src, dst, linkedid)
def trigger_microphone(n_clicks):
    if n_clicks == 0:
        return ''
    print('trigger microphone %d' % n_clicks)
    import termux
    termux.Microphone.stop()
    pwd = os.environ['PWD']
    aac_file = "%s/microphone.aac" % pwd
    wave_file = "%s/microphone.wave" % pwd
    if os.path.exists(aac_file):
        os.remove(aac_file)
    termux.Microphone.record(aac_file, encoder='aac', limit=5, count=2)
    import time
    time.sleep(6)
    os.system('faad -o %s %s' % (wave_file, aac_file))
    if False:
        # Alternative path using CMU Sphinx, kept disabled.
        import speech_recognition as sr
        r = sr.Recognizer()
        with sr.WavFile(wave_file) as source:
            audio = r.record(source)
        text = r.recognize_sphinx(audio)
    else:
        from vosk import Model, KaldiRecognizer, SetLogLevel
        import wave
        import numpy as np
        model_name = 'vosk-model-small-en-us-0.15'
        if not os.path.exists(model_name):
            os.system('wget http://alphacephei.com/vosk/models/%s.zip' % model_name)
            os.system('unzip %s.zip' % model_name)
        wf = wave.open(wave_file, "rb")
        model = Model(model_name)
        rec = KaldiRecognizer(model, wf.getframerate())
        nch = wf.getnchannels()
        depth = wf.getsampwidth()
        typ = {1: np.uint8, 2: np.uint16, 4: np.uint32}.get(depth)
        sdata = wf.readframes(64000)
        data = np.frombuffer(sdata, dtype=typ)
        ch_data = data[0::nch]  # keep only the first channel
        sdata = ch_data.tobytes()
        if True:
            outwav = wave.open('good.wave', 'w')
            outwav.setparams(wf.getparams())
            outwav.setnchannels(1)
            outwav.writeframes(ch_data.tobytes())
            outwav.close()
        if rec.AcceptWaveform(sdata):
            result = json.loads(rec.Result())
            text = result['text']
        else:
            result = json.loads(rec.PartialResult())
            text = result['partial']
        result = json.loads(rec.FinalResult())
        text += result['text']
    print('finish microphone')
    print('text:%s' % text)
    return text
def gen_subparts(input_file, model_dir, verbose=False, partlen=4, progress=False):
    SetLogLevel(0 if verbose else -1)
    model = Model(model_dir)
    rec = KaldiRecognizer(model, 16000)
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', input_file,
        '-ar', str(16000), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    r = subprocess.run(
        "ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1"
        .split() + [input_file],
        stdout=subprocess.PIPE)
    duration = float(r.stdout.decode('utf-8').strip())
    if progress:
        pbar = tqdm(total=duration, unit="s")
    prev_end = 0
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            r = json.loads(rec.Result())
            if 'result' in r:
                resultpart = []  # TODO: carry this across AcceptWaveform calls
                for result in r['result']:
                    # Flush the accumulated part once it spans at least partlen seconds.
                    if len(resultpart) > 0 and float(result['end']) - float(
                            resultpart[0]['start']) >= partlen:
                        yield SubPart(start=resultpart[0]['start'],
                                      end=float(resultpart[-1]['end']),
                                      text=" ".join(r['word'] for r in resultpart))
                        prev_end = float(resultpart[-1]['end'])
                        resultpart = []
                    if float(result['end'] - result['start']) >= partlen:
                        yield SubPart(start=float(result['start']),
                                      end=float(result['end']),
                                      text=result['word'])
                        prev_end = float(result['end'])
                        resultpart = []
                    else:
                        resultpart.append(result)
                    if progress:
                        pbar.update(float(result['end'] - pbar.n))
                if len(resultpart) > 0:
                    yield SubPart(start=float(resultpart[0]['start']),
                                  end=float(resultpart[-1]['end']),
                                  text=" ".join(r['word'] for r in resultpart))
                    prev_end = float(resultpart[-1]['end'])
                    resultpart = []
        else:
            pass  # print(rec.PartialResult())
    if progress:
        pbar.close()
    r = json.loads(rec.PartialResult())
    text = r['partial']
    yield SubPart(start=prev_end, end=duration, text=text)
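# Hypothetical driver for gen_subparts() above; the media file and model
# directory are assumptions. SubPart is assumed to expose start, end, and
# text attributes, as the generator's yields imply.
if __name__ == "__main__":
    for part in gen_subparts("lecture.mp4", "vosk-model-small-en-us-0.15",
                             partlen=4, progress=True):
        print("%8.2f -> %8.2f  %s" % (part.start, part.end, part.text))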
class SpeechDetector:
    @classmethod
    def hotword_list(cl):
        return util.hotword_list_snowboy()

    @classmethod
    def hotword_model_list(cl):
        return util.hotword_model_list_snowboy()

    def __init__(self,
                 hotword_model=[util.resource(f'snowboy/hotword_models/{a}')
                                for a in ['阿Q.pmdl']],
                 sensitivity=.5,
                 lang='zh',
                 audio_gain=1,
                 silence_timeout=2,
                 recognition_timeout=10):
        if not isinstance(hotword_model, list):
            hotword_model = [hotword_model]
        if isinstance(sensitivity, list):
            assert len(hotword_model) == len(sensitivity), \
                'Number of hotwords does not match number of sensitivities'
        else:
            sensitivity = [sensitivity] * len(hotword_model)
        self._detect = snowboydetect.SnowboyDetect(
            resource_filename=util.resource('snowboy/common.res').encode(),
            model_str=",".join(hotword_model).encode())
        self._detect.SetAudioGain(audio_gain)
        self._detect.ApplyFrontend(False)
        self._detect.SetSensitivity(','.join([str(s) for s in sensitivity]).encode())
        assert lang.lower() in ['en', 'zh', 'cn'], 'Only English and Chinese are supported'
        self._rec = KaldiRecognizer(
            Model(util.resource('sphinx/vosk-model-en-us-daanzu-20200328-lgraph')
                  if lang == 'en' else
                  util.resource('sphinx/vosk-model-cn-0.1')),
            self._detect.SampleRate())
        self._hotwords = [w.split('/')[-1].split('.')[0] for w in hotword_model]
        self._recognition_timeout = int(recognition_timeout / self.required_buffer_size)
        self._silence_timeout = int(silence_timeout / self.required_buffer_size)

    @property
    def required_samplerate(self):
        return 16000

    @property
    def required_bit_depth(self):
        return 16

    @property
    def required_channels(self):
        return 1

    @property
    def required_buffer_size(self):
        # 0.1 second of audio
        return self.required_samplerate * self.required_bit_depth // 8 * self.required_channels // 10

    def stop(self):
        self._stop = True

    def detect(self, stream, *, hotword_callback=None, speech_callback=None):
        self._stop = False
        recognizing = False
        recognition_count = silence_count = 0
        for data in stream:
            if self._stop:
                return
            status = self._detect.RunDetection(data)
            if status == -1:
                logger.warning("Error initializing streams or reading audio data")
            if recognizing:
                if self._rec.AcceptWaveform(data):
                    speech_callback(json.loads(self._rec.Result())['text'].replace(' ', ''))
                    recognizing = False
                else:
                    recognition_count += 1
                    if status == -2:  # silence detected
                        silence_count += 1
                    else:
                        silence_count = 0
                    if recognition_count >= self._recognition_timeout or \
                            silence_count >= self._silence_timeout:
                        speech_callback(json.loads(self._rec.FinalResult())['text'].replace(' ', ''))
                        recognizing = False
            elif status > 0:
                hotword_callback and hotword_callback(self._hotwords[status])
                if speech_callback:
                    recognition_count = silence_count = 0
                    recognizing = True

    def detect_once(self, stream, *, hotword_callback=None, speech_callback=None):
        self._stop = False
        recognizing = False
        recognition_count = silence_count = 0
        for data in stream:
            if self._stop:
                return
            status = self._detect.RunDetection(data)
            if status == -1:
                logger.warning("Error initializing streams or reading audio data")
            if recognizing:
                if self._rec.AcceptWaveform(data):
                    speech_callback(json.loads(self._rec.Result())['text'])
                    return
                else:
                    recognition_count += 1
                    if status == -2:  # silence detected
                        silence_count += 1
                    else:
                        silence_count = 0
                    if recognition_count >= self._recognition_timeout or \
                            silence_count >= self._silence_timeout:
                        speech_callback(json.loads(self._rec.FinalResult())['text'])
                        return
            elif status > 0:
                hotword_callback and hotword_callback(self._hotwords[status])
                if speech_callback:
                    recognizing = True
                else:
                    return
class Tester:
    def __init__(self,
                 filepath: Optional[str],
                 model_path: str,
                 sample_rate: int,
                 use_gpu: bool = False):
        if use_gpu:
            # GPU part; only works when vosk-api is built with GPU support.
            from vosk import GpuInit, GpuInstantiate
            GpuInit()
            GpuInstantiate()
        self.sample_rate = sample_rate
        self.model = Model(model_path)
        self.rec = KaldiRecognizer(self.model, sample_rate)
        self.filepath = filepath

    def _read(self, out):
        while True:
            data = out.read(8000)
            if len(data) == 0:
                break
            if self.rec.AcceptWaveform(data):
                print(self.rec.Result())
            else:
                print(self.rec.PartialResult())
        print(self.rec.FinalResult())

    def _test_microphone(self):
        stream = PyAudio().open(format=paInt16,
                                channels=1,
                                rate=self.sample_rate,
                                input=True,
                                frames_per_buffer=8000)
        stream.start_stream()
        self._read(stream)

    def _test_file(self, filepath):
        process = subprocess.Popen(
            ['ffmpeg', '-loglevel', 'quiet', '-i', filepath,
             '-ar', str(self.sample_rate), '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)
        self._read(process.stdout)

    def test(self):
        if self.filepath is None:
            self._test_microphone()
        else:
            self._test_file(self.filepath)
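# Hypothetical usage of the Tester class above; the model path and audio file
# are assumptions, and ffmpeg must be on PATH for the file branch.
if __name__ == "__main__":
    tester = Tester(filepath="sample.mp3",
                    model_path="models/vosk-model-small-en-us-0.15",
                    sample_rate=16000)
    tester.test()  # pass filepath=None to test live microphone input instead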
class VoskInput(BaseInput):
    """
    Uses the `vosk` package to do speech recognition.
    """

    def __init__(self):
        super(VoskInput, self).__init__()
        self.current_utterance = ""
        self.realtime = True  # indicates that audio can be streamed in
        model_name = crystal.core.get_config('vosk_model') or 'vosk-model-small-en-us-0.3'
        log.info(f"Using vosk model: {model_name}")
        self.model = Model(f"models/{model_name}")
        self.rec = None
        self.__final_result = None

    def process_audio(self, raw_audio: bytes, sample_rate: int, sample_width: int):
        if not self.rec:
            self.rec = KaldiRecognizer(self.model, sample_rate)
        full = self.rec.AcceptWaveform(raw_audio)
        if full:
            result = self.rec.Result()
        else:
            result = self.rec.PartialResult()
        log.debug(result)
        result = json.loads(result)
        if "result" in result:
            self.__final_result = result
        if "text" in result:
            text = result["text"]
        elif "partial" in result:
            text = result["partial"]
        if text:
            self.current_utterance = text
        return self.current_utterance

    def get_full_result(self):
        if self.__final_result:
            # The stored final result is already a parsed dict.
            result = self.__final_result
        else:
            # FinalResult() returns a JSON string, so parse it here.
            result = json.loads(self.rec.FinalResult())
        log.debug(result)
        self.rec = None
        self.current_utterance = ""
        self.__final_result = None
        full_text = result["text"]
        # HACK: auto-correct text to match domain vocabulary. Sorry.
        full_text = full_text.replace("palace music", "pause music")
        full_text = full_text.replace("applause music", "pause music")
        if any(x in full_text for x in ["turn on", "turn off", "turned on", "turned off"]):
            full_text = full_text.replace("the land", "the lamp").replace(
                "the lamb", "the lamp")
            if full_text.endswith("the lam"):
                full_text = full_text.replace("the lam", "the lamp")
        if any(x in full_text for x in ["timer", "alarm"]):
            full_text = full_text.replace("crystal said", "crystal set")
            if full_text.endswith("to pm"):
                full_text = full_text.replace("to pm", "2 pm")
            elif full_text.endswith(" a m"):
                full_text = full_text.replace(" a m", " am")
        if full_text.startswith("christo"):
            full_text = full_text.replace("christo", "crystal")
        elif full_text.startswith("crews to"):
            full_text = full_text.replace("crews to", "crystal")
        elif full_text.startswith("christian"):
            full_text = full_text.replace("christian", "crystal")
        return full_text
class SpeechRecognizer:
    """Speech recognizer, a thin wrapper around |CMUSphinx vosk|.

    .. |CMUSphinx vosk| raw:: html

        <a href='https://github.com/alphacep/vosk-api' target='blank'>CMUSphinx vosk</a>

    :param lang: language, currently Chinese `'zh'` or English `'en'`; defaults to Chinese
    :type lang: str, optional
    """

    def __init__(self, lang='zh'):
        lang = lang.lower()
        self._lang = lang
        assert lang in ['en', 'zh', 'cn'], 'Only English and Chinese are supported'
        self._rec = KaldiRecognizer(
            Model(util.resource('sphinx/vosk-model-en-us-daanzu-20200328-lgraph')
                  if lang == 'en' else
                  util.resource('sphinx/vosk-model-cn-0.1')),
            16000)
        self._detect = snowboydetect.SnowboyDetect(
            resource_filename=util.resource('snowboy/common.res').encode(),
            model_str=util.resource('snowboy/hotword_models/阿Q.pmdl').encode())
        self._detect.SetAudioGain(2)
        self._detect.ApplyFrontend(False)
        self._detect.SetSensitivity('0.5'.encode())

    def recognize(self, stream, timeout=10, silence_timeout=2):
        """Start recognizing.

        :param stream: audio data stream
        :param timeout: the longest recognition time in seconds; defaults to
            `10`, and `None` means no timeout
        :type timeout: float, optional
        :param silence_timeout: silence timeout in seconds; if nothing is said
            for this long, the utterance is considered finished. Defaults to
            `2`, and `None` means no silence timeout
        :type silence_timeout: float, optional
        :return: the recognized phrase or sentence
        :rtype: str
        """
        self._cancel = False
        recognition_count = silence_count = 0.0
        for data in stream:
            if self._cancel:
                raise Exception('Speech recognition cancelled by another thread')
            if self._rec.AcceptWaveform(data):
                text = self._rec.Result()
                break
            ln = len(data) / 32000  # 1 second = 16000 (samplerate) * 2 bytes per sample
            recognition_count += ln
            if timeout and recognition_count > timeout:
                text = self._rec.FinalResult()
                break
            if self._detect.RunDetection(data) == -2:  # silence
                silence_count += ln
                if silence_timeout and silence_count > silence_timeout:
                    text = self._rec.FinalResult()
                    break
        text = json.loads(text)['text']
        if not self._lang == 'en':
            text = text.replace(' ', '')
        return text

    def cancel(self):
        """Stop recognizing."""
        self._cancel = True
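# Hypothetical usage of SpeechRecognizer above. The class expects an iterable
# of raw 16 kHz, 16-bit mono PCM chunks; this pyaudio-based generator is an
# assumption about how such a stream might be produced.
import pyaudio

def microphone_stream(frames_per_chunk=1600):  # 0.1 s at 16 kHz
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=frames_per_chunk)
    while True:
        yield stream.read(frames_per_chunk, exception_on_overflow=False)

if __name__ == "__main__":
    recognizer = SpeechRecognizer(lang='en')
    print(recognizer.recognize(microphone_stream(), timeout=10, silence_timeout=2))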
def my_link():
    print("entered into function for processing")
    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    conn = connect()
    model = Model("vosk-model-small-en-in-0.4")
    pth = os.listdir(audio_path)
    # print(pth)
    print("entering into while loop")
    while c in pth:
        # sound = AudioSegment.from_wav('C:/Users/admin/Downloads/' + i)
        # sound = sound.set_channels(1)        # make it mono
        # sound = sound.set_frame_rate(44100)  # sample rate 44,100 Hz
        # sound.export('C:/Users/admin/Downloads/' + i, format="wav")
        wf = wave.open(audio_path + '/' + c, 'rb')
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        result = json.loads(rec.FinalResult())  # parse the JSON result string
        print(c)
        s = result["text"]
        print(s)
        # Run the text-classification pipeline.
        temp = remove_punct(s)
        temp = tknz_text(temp)
        temp = remove_stopwords(temp)
        temp = stmng(temp)
        # Remove punctuation tokens.
        puncs = set(['"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%',
                     '\\x', '!', '?', "'", 's'])
        temp2 = []
        for i in temp:
            if i[0].isalpha() == True:
                temp2.append(i)
        # Remove empty/whitespace tokens.
        temp1 = []
        for i in temp2:
            if i not in ("", '', " ", ' '):
                temp1.append(i)
        fg, word = check_list(conn, temp1)
        if fg == 1:
            # flash("Abusive Detected")
            print("Abusive Detected")
        else:
            print("Normal Text")
        print()
        # close_the_connection(conn)
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
        pth = os.listdir(audio_path)
        time.sleep(5)
    print("exiting while loop")
    delete()
    return redirect('http://127.0.0.1:5000/')
model = Model("model") rec = KaldiRecognizer(model, 16000) p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000) stream.start_stream() result = "" init_time = time.time() while True: current_time = time.time() if current_time - init_time < 5: data = stream.read(4000) if len(data) == 0: break if rec.AcceptWaveform(data): #print(rec.Result()) result = result + " " + json.loads(rec.Result())['text'] print(result) else: pass else: break result = result + json.load(rec.FinalResult())['text'] print(result)
def my_link():
    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    conn = connect()
    model = Model("vosk-model-small-en-in-0.4")
    pth = os.listdir(audio_path)
    while c in pth:
        wf = wave.open(audio_path + '/' + c, 'rb')
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            rec.AcceptWaveform(data)
        result = json.loads(rec.FinalResult())  # parse the JSON result string
        print(c)
        s = result["text"]
        print(s)
        # Text classification starts.
        temp = remove_punct(s)
        temp = tknz_text(temp)
        temp = remove_stopwords(temp)
        temp = stmng(temp)
        # Remove punctuation tokens.
        puncs = set(['"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%',
                     '\\x', '!', '?', "'", 's'])
        temp2 = []
        for i in temp:
            if i[0].isalpha() == True:
                temp2.append(i)
        # Remove empty/whitespace tokens.
        temp1 = []
        for i in temp2:
            if i not in ("", '', " ", ' '):
                temp1.append(i)
        print(temp1)
        fg, word = check_list(conn, temp1)
        if fg == 1:
            print("Abusive Detected")
        else:
            print("Normal Text")
        print()
        # close_the_connection(conn)
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
        pth = os.listdir(audio_path)
        # print(pth, "-->", c)
        time.sleep(5)
    try:
        delete()
    except:
        return render_template('index.html')
    return render_template('index.html')