Example 1
 def execute(self, audio, language=None):
     # Run a fresh recognizer over the complete utterance and return its text
     kaldi = KaldiRecognizer(self.model, 16000)
     kaldi.AcceptWaveform(audio.get_wav_data())
     res = json.loads(kaldi.FinalResult())
     return res["text"]
Example 2
import os
import sys
import subprocess
import srt
import json
import datetime

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(-1)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

sample_rate = 16000
model = Model("model")
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)

process = subprocess.Popen([
    'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
    str(sample_rate), '-ac', '1', '-f', 's16le', '-'
],
                           stdout=subprocess.PIPE)

WORDS_PER_LINE = 7


def transcribe():
    results = []
    subs = []
    while True:
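        # The snippet is truncated here. A hedged completion, mirroring the
        # read/AcceptWaveform loop used by the other examples in this list:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    results.append(rec.FinalResult())
    return results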
Example 3
import os
import sys
import wave

from vosk import Model, KaldiRecognizer

if not os.path.exists("model"):
    print(
        "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder."
    )
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
# You can also specify the possible word list
rec = KaldiRecognizer(model, wf.getframerate(),
                      "zero oh one two three four five six seven eight nine")

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
Example 4
#!/usr/bin/env python3
import math
import struct
import audioop
from time import sleep

from vosk import Model, KaldiRecognizer

import pyaudio

model = Model("vosk-model-small-ru-0.4")
rec = KaldiRecognizer(model, 16000)

Threshold = 400

SHORT_NORMALIZE = (1.0 / 32768.0)

sample_width = 16000  # NOTE: for paInt16 the sample width is 2 bytes; this value matches the sample rate


def get_audio():
    p = pyaudio.PyAudio()
    stream = p.open(input_device_index=6,
                    format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=4000)
    frames = []
    cnt = 0
    print('start')
Example 5
#!/usr/bin/python3

from vosk import Model, KaldiRecognizer
import sys
import json
import os

if not os.path.exists("model-en"):
    print ("Please download the model from https://github.com/alphacep/kaldi-android-demo/releases and unpack as 'model-en' in the current folder.")
    exit (1)


model = Model("model-en")

# You can also specify the possible word list
rec = KaldiRecognizer(model, 16000, "zero oh one two three four five six seven eight nine")

wf = open(sys.argv[1], "rb")
wf.read(44) # skip header

while True:
    data = wf.read(2000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print (res)
    else:
        res = json.loads(rec.PartialResult())
        print (res)
Example 6
import sys
import os
import wave

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(0)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, wf.getframerate())
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print('RESULTS :', rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
Example 7
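# Assumed context for this snippet (not shown in the original): json, string,
# threading, time, numpy as np, pyaudio, azure.cognitiveservices.speech as
# speechsdk, vosk's Model/KaldiRecognizer, the mode constants
# WAITING_FOR_WAKEWORD / RECORDING_COMMAND, and project-local RingBuffer /
# CommandEntry helpers.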
class VoiceController:
	def __init__(self,
	             api_key,
	             wake_word="computer",
	             region="eastus",
	             languages=None):
		self.__api_key = api_key
		self.__wake_word = wake_word
		self.__region = region
		self.__languages = languages
		if not self.__languages:
			self.__languages = ["en-US"]

		# Are we currently processing a command?
		self.__active = False

		# Samples per second to read from mic (single channel)
		self.__framerate = 44100

		self.__config = speechsdk.SpeechConfig(subscription=self.__api_key,
		                                       region=self.__region)

		# Write sounds to this stream to send it to azure's speech recognition
		self.__hq_stream = speechsdk.audio.PushAudioInputStream(stream_format = speechsdk.audio.AudioStreamFormat(samples_per_second=self.__framerate))
		self.__audio_config = speechsdk.AudioConfig(stream = self.__hq_stream)

		if len(self.__languages) == 1:
			self.__hq_recognizer = speechsdk.SpeechRecognizer(
			    speech_config=self.__config, language=self.__languages[0], audio_config = self.__audio_config)
		else:
			self.__hq_recognizer = speechsdk.SpeechRecognizer(
			    speech_config=self.__config,
			    auto_detect_source_language_config=speechsdk.languageconfig.
			    AutoDetectSourceLanguageConfig(languages=self.__languages), audio_config = self.__audio_config)

		# Recognizer used to detect the wake word
		self.__lq_recognizer = KaldiRecognizer(Model("model"), self.__framerate,'["' + self.__wake_word + '"]')

		# Add callbacks to azure events
		self.__hq_recognizer.recognized.connect(self.on_recognized)
		self.__hq_recognizer.session_stopped.connect(self.on_session_stopped)
		self.__hq_recognizer.canceled.connect(self.on_session_stopped)

		# Callbacks
		self.on_ready = lambda *x: x
		self.on_triggered = lambda *x: x
		self.on_begin_command = lambda *x: x
		self.on_finish_command = lambda *x: x
		self.on_unknown_command = lambda *x: x
		self.on_error = lambda *x: x

		# List of commands
		self.__commands = []
		self.__alternatives = {}

		self.__buffer_size = 5
		self.wake_buffer = RingBuffer(self.__buffer_size * self.__framerate)

		self.recognized = threading.Event()
		self.mode = WAITING_FOR_WAKEWORD

		# Future returned by Azure's recognize_once_async, not sure what the point is since you can just
		# connect callbacks
		self.fut = None

		# Number of frames missed by the local recognizer while processing commands
		self.missed_frames = 0

	def on_session_stopped(self, evt):
		if self.fut:
			self.fut.get()
		self.fut = None

	def add_command(self, pattern, callback):
		self.__commands.append(CommandEntry(pattern, callback, self.__alternatives))

	def add_alternatives(self, word_or_dict, alts=None):
		# Use None instead of a mutable default list, which would be shared
		# across calls and mutated by the += below
		if alts is None:
			alts = []
		if type(word_or_dict) == dict:
			self.__alternatives.update(word_or_dict)
		else:
			if word_or_dict in self.__alternatives:
				self.__alternatives[word_or_dict] += alts
			else:
				self.__alternatives[word_or_dict] = alts

	def perform_all_commands(self, cmd):
		while True:
			has_match = False
			for command in self.__commands:
				result, next_command = command.try_invoke(cmd)
				if result:
					has_match = True
					if next_command:
						cmd = next_command
						break
					else:
						return
			if not has_match:
				break
		self.on_unknown_command(cmd)

	def on_recognized(self, event):
		try:
			speech = event.result.text.translate(str.maketrans('', '', string.punctuation)).lower()
			print("Recognized: {}".format(speech))
			self.perform_all_commands(speech)
			self.fut.get()
			self.fut = None

			self.mode = WAITING_FOR_WAKEWORD
		except Exception as e:
			print(e)

	def audio_callback(self, in_data, frame_count, time_info, status):
		if status:
			print(status)

		audio_data = np.frombuffer(in_data, dtype=np.int16)  # np.fromstring is deprecated
		if self.mode == WAITING_FOR_WAKEWORD:
			self.wake_buffer.extend(audio_data)
			if self.__lq_recognizer.AcceptWaveform(in_data):
				self.recognized.set()
		elif self.mode == RECORDING_COMMAND:
			self.__hq_stream.write(audio_data)
			self.missed_frames += frame_count


		return (None, pyaudio.paContinue)

	def reset_offline_recognizer(self):
		self.missed_frames = 0
		self.__lq_recognizer = KaldiRecognizer(Model("model"), self.__framerate,'["' + self.__wake_word + '"]')

	def recognize_stream(self):
		self.start_time = time.time()
		while True:
			self.recognized.wait()
			self.recognized.clear()

			result = self.__lq_recognizer.Result()

			jres = json.loads(result)
			if not self.__active and jres["text"] == self.__wake_word:
				self.on_triggered()
				self.mode = RECORDING_COMMAND

				wakeword_end_time = 0
				for res in jres["result"]:
					if res["word"] == self.__wake_word:
						wakeword_end_time = res["end"]

				lag = time.time() - self.start_time - wakeword_end_time - (self.missed_frames / self.__framerate)
				lag = int(round((lag) * self.__framerate))

				start_data = self.wake_buffer.get(lag)
				self.fut = self.__hq_recognizer.recognize_once_async()
				missed = start_data[:lag]
				missed = np.resize(missed, self.__framerate)
				self.__hq_stream.write(missed)


	def start_listening(self):
		p = pyaudio.PyAudio()
		stream = p.open(format=pyaudio.paInt16,
		                channels=1,
		                rate=self.__framerate,
		                input=True,
		                frames_per_buffer=1024,
						stream_callback=self.audio_callback)
		stream.start_stream()
		self.on_ready()
		self.recognize_stream()
Example 8
def no_internet(i):

    #speak('no internet ')
    from vosk import Model, KaldiRecognizer
    import threading
    import datetime
    import time
    from datetime import date
    today = date.today()
    t2 = threading.Thread(target=seeting)
    t1 = threading.Thread(target=texteditor)
    t3 = threading.Thread(target=terminal)
    t4 = threading.Thread(target=files)
    t5 = threading.Thread(target=cal)

    def text(Text):
        # Pull the recognized string out of Vosk's JSON result by taking the
        # value of the last quoted field (the "text" value)
        parts = Text.split('"')
        ste = parts[-2]
        print("recognized:", ste)
        return ste

    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)

    import pyaudio

    model = Model("model")
    rec = KaldiRecognizer(model, 16000)

    p = pyaudio.PyAudio()
    print("PyAudio initialized:", type(p))
    program.set("listening...")
    root.update()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    while i == 1:
        #x = threading.Thread(target=text(),args=(rec.Result()))
        data = stream.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            voice = text(rec.Result())
            if 'setting' in voice or 'seating' in voice:
                voice = voice.replace('open', '')
                try:
                    speak('opening setting')
                    program.set("opening setting")
                    root.update()
                    t2.start()
                except:
                    if t2.is_alive():
                        speak('already opening')
                    else:
                        t2.join()
                        del t2
                        t2 = threading.Thread(target=seeting)
                        t2.start()
            elif 'text editor' in voice or 'notepad' in voice or 'edit' in voice:
                voice = voice.replace('open', '')
                try:
                    speak('opening text editor')
                    program.set("opening text editor")
                    root.update()
                    t1.start()
                except:
                    if t1.is_alive():
                        speak('already opening')
                    else:
                        t1.join()
                        del t1
                        t1 = threading.Thread(target=texteditor)
                        t1.start()
            elif 'terminal' in voice or 'cmd' in voice or 'ter' in voice:
                voice = voice.replace('open', '')
                try:
                    speak('opening terminal')
                    program.set("opening terminal")
                    root.update()
                    t3.start()
                except:
                    if t3.is_alive():
                        speak('already opening')
                    else:
                        t3.join()
                        del t3
                        t3 = threading.Thread(target=terminal)
                        t3.start()
            elif 'calculator' in voice or 'cal' in voice:
                voice = voice.replace('open', '')
                try:
                    speak('opening calculator')
                    program.set("opening calculator")
                    root.update()
                    t5.start()
                except:
                    if t5.is_alive():
                        speak('already opening')
                    else:
                        t5.join()
                        del t5
                        t5 = threading.Thread(target=cal)
                        t5.start()
            elif 'files' in voice or 'file' in voice:
                voice = voice.replace('open', '')
                try:
                    speak('opening file system')
                    program.set("opening files")
                    root.update()
                    t4.start()
                except:
                    if t4.is_alive():
                        speak('already opening')
                    else:
                        t4.join()
                        del t4
                        t4 = threading.Thread(target=files)
                        t4.start()
            elif 'exit' in voice or 'tata' in voice or 'goodbye' in voice or 'quit' in voice or 'bye bye' in voice:
                speak("good bye dear have a great day")
                exit(1)
            elif 'time' in voice:
                time = datetime.datetime.now().strftime('%I:%M %p')
                program.set('Current time is =' + time)
                root.update()
                speak('Current time is ' + time)
            elif 'date' in voice or 'day' in voice or 'debt' in voice:
                d2 = today.strftime("%B %d, %Y")
                program.set(d2)
                root.update()
                speak(d2)
                print(d2)

            else:
                speak('tell me what I can do without internet')
            i = i + 1
            print("Result", rec.Result())
            #x.start()
        else:
            rec.PartialResult()
Example 9
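# Snippet from the middle of a microphone-transcription script. Assumed
# context (not shown): argparse 'args', a queue 'q' filled by an audio
# 'callback', sounddevice imported as 'sd', a loaded vosk 'model', and an
# enclosing try: block that the trailing except clauses belong to.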
    if args.filename:
        dump_fn = open(args.filename, "wb")
    else:
        dump_fn = None

    with sd.RawInputStream(samplerate=args.samplerate,
                           blocksize=8000,
                           device=args.device,
                           dtype='int16',
                           channels=1,
                           callback=callback):
        print('#' * 80)
        print('Press Ctrl+C to stop the recording')
        print('#' * 80)

        rec = KaldiRecognizer(model, args.samplerate)
        while True:
            data = q.get()
            if rec.AcceptWaveform(data):
                print(rec.Result())
            #else:
            #print(rec.PartialResult())
            if dump_fn is not None:
                dump_fn.write(data)

except KeyboardInterrupt:
    print('\nDone')
    parser.exit(0)
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))
Example 10
import pyaudio
from vosk import Model, KaldiRecognizer
import rospy
from std_msgs.msg import String

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()

model = Model("model")
rec = KaldiRecognizer(model, 16000)

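# Russian voice commands mapped to English robot commands (presumably for the
# ROS String publisher imported above)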
dictionary = {
    'вперёд': 'forward',
    'назад': 'backwards',
    'налево': 'left',
    'направо': 'right',
    'влево': 'left',
    'вправо': 'right',
    'стоп': 'stop',
    'разверн': 'turn around',
    'иди': 'go',
    'стой': 'stop'
}

Example 11
            print('Writing %d transcripts into file %s' %
                  (len(transcripts), outCTMFile))
            with open(outJSONFile, "w") as ofp:
                ofp.write(json.dumps(transcripts, indent=4))

            with open(outCTMFile, 'w') as ofp:
                for transcript in transcripts:
                    #print('\t%s (%s-%s-%s)\n' % (transcript['transcription'], sessionId, transcript['utterance_start'], transcript['utterance_duration']))
                    for token in transcript["tokens"]:
                        ofp.write("%s \t 1 \t %.2f \t %.2f \t %s\n" %
                                  (sessionId, token["start"],
                                   token["duration"], token["baseform"]))
            print(' ')

        elif args.engine == 'vosk':
            rec = KaldiRecognizer(VoskModel, sampleRate)
            rec.SetWords(True)
            # get the list of JSON dictionaries
            results = []
            if (useSegmentsInVosk):
                for segment in tqdm(segments):
                    del rec
                    rec = KaldiRecognizer(VoskModel, sampleRate)
                    rec.SetWords(True)
                    if (len(segment.bytes) == 0):
                        continue
                    if (rec.AcceptWaveform(segment.bytes)):
                        part_result = json.loads(rec.Result())
                        part_result['uttstart'] = segment.timestamp
                        results.append(part_result)
                    part_result = json.loads(rec.FinalResult())
Example 12
import os
import subprocess
import sys
import json

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(0)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

sample_rate = 16000
model = Model("model")
rec = KaldiRecognizer(model, sample_rate)

# process = subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i',
#                             'demo2.mp3',
#                             '-ar', str(sample_rate) , '-ac', '1', '-f', 's16le', '-'],
#                             stdout=subprocess.PIPE)
process = subprocess.Popen([
    'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
    str(sample_rate), '-ac', '1', '-f', 's16le', '-'
],
                           stdout=subprocess.PIPE)

totalResult = []
while True:
    data = process.stdout.read(4000)
    if len(data) == 0:
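        break
    # Hedged completion: the source snippet is cut off above; appending each
    # utterance result matches the 'totalResult' list set up earlier.
    if rec.AcceptWaveform(data):
        totalResult.append(json.loads(rec.Result()))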
Example 13
import os
import sys
import wave
import json

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(-1)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, wf.getframerate())

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        pass  # per-utterance results are ignored; only the final text is used
    else:
        rec.PartialResult()  # drain partials without printing them

res = json.loads(rec.FinalResult())
print(res["text"])
Example 14
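# Assumed context for this snippet (not shown in the original): it relies on
# os, re, json, vosk's Model/SpkModel/KaldiRecognizer, a project 'settings'
# module, the 'EngineInterface' base class, and the TextToNumberProcessor /
# DateAndTimeOptimizer post-processors.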
class VoskProcessor(EngineInterface):
    """Process chunks with Vosk"""
    def __init__(self, send_message, options: dict = None):
        """Create Vosk processor"""
        super().__init__(send_message)
        # Options
        if not options:
            options = {}
        # Common options - See 'EngineInterface'
        self._sample_rate = options.get("samplerate", float(16000))
        self._language = options.get("language")
        if self._language:
            self._language = self._language.replace(
                "_", "-")  # make sure we have xx-XX format
            self.language_code_short = re.split("[-]",
                                                self._language)[0].lower()
        else:
            self.language_code_short = None
        self._asr_model_path = options.get("model", None)
        self._continuous_mode = options.get("continuous", False)
        self._optimize_final_result = options.get("optimizeFinalResult", False)
        # Specific options
        self._alternatives = options.get("alternatives", int(1))
        self._return_words = options.get("words", False)
        try_speaker_detection = options.get("speaker", False)
        self._phrase_list = options.get("phrases")
        # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"]
        # NOTE: speaker detection does not work in all configurations
        if try_speaker_detection:
            self._speaker_detection = (settings.has_speaker_detection_model
                                       and self._alternatives == 0)
        else:
            self._speaker_detection = False
        # Recognizer
        if self._asr_model_path:
            # Reset language because model has higher priority
            if self._asr_model_path in settings.asr_model_paths:
                model_index = settings.asr_model_paths.index(
                    self._asr_model_path)
                self._language = settings.asr_model_languages[model_index]
            else:
                self._language = ""
        elif not self._language or self._language not in settings.asr_model_languages:
            self._asr_model_path = settings.asr_model_paths[0]
            self._language = settings.asr_model_languages[0]
        else:
            model_index = settings.asr_model_languages.index(self._language)
            self._asr_model_path = settings.asr_model_paths[model_index]
        asr_model_path = settings.asr_models_folder + self._asr_model_path
        # Speaker model
        spk_model_path = settings.speaker_models_folder + settings.speaker_model_paths[
            0]
        # Make sure paths exist and load models
        if self._asr_model_path not in settings.asr_model_paths:
            raise RuntimeError(
                "ASR model path is not defined in available paths")
        if not os.path.exists(asr_model_path):
            raise RuntimeError("ASR model path seems to be wrong")
        if self._speaker_detection and not os.path.exists(spk_model_path):
            raise RuntimeError("Speaker model path seems to be wrong")
        self._model = Model(asr_model_path)
        if self._speaker_detection:
            self._spk_model = SpkModel(spk_model_path)
        # Use phrase list?
        if self._phrase_list and len(self._phrase_list) > 0:
            self._recognizer = KaldiRecognizer(
                self._model, self._sample_rate,
                json.dumps(self._phrase_list, ensure_ascii=False))
        else:
            self._recognizer = KaldiRecognizer(self._model, self._sample_rate)
        self._recognizer.SetMaxAlternatives(self._alternatives)
        if self._return_words:
            self._recognizer.SetWords(True)
        if self._speaker_detection:
            self._recognizer.SetSpkModel(self._spk_model)
        self._partial_result = {}
        self._last_partial_str = ""
        self._final_result = {}
        # states - 0: waiting for input, 1: got partial result, 2: got final result, 3: closing
        self._state = 0
        #
        # TODO: GPU support: check Vosk examples to find out how to enable GPU ... :-P
        # Example code:
        # from vosk import GpuInit, GpuInstantiate
        # GpuInit()
        # def thread_init():
        #     GpuInstantiate()
        # pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

    async def process(self, chunk: bytes):
        """Feed audio chunks to recognizer"""
        result = None
        if self._state == 3:
            pass
        elif self._recognizer.AcceptWaveform(chunk):
            # Silence detected
            result = self._recognizer.Result()
            self._state = 2
            await self._handle_final_result(result)
        else:
            # Partial results possible
            result = self._recognizer.PartialResult()
            self._state = 1
            await self._handle_partial_result(result)
        # End?
        #if not self.accept_chunks:
        #    await self._finish()

    async def finish_processing(self):
        """Wait for last process and end"""
        # End?
        await self._finish()

    async def close(self):
        """Reset recognizer and remove"""
        #if self._recognizer:
        #self._recognizer.Reset()   # this throws an error!? Maybe because it's closed already?
        #self._recognizer = None

    def get_options(self):
        """Get Vosk options for active setup"""
        active_options = {
            "language": self._language,
            "model": self._asr_model_path,
            "samplerate": self._sample_rate,
            "optimizeFinalResult": self._optimize_final_result,
            "alternatives": self._alternatives,
            "continuous": self._continuous_mode,
            "words": self._return_words,
            "speaker": self._speaker_detection
        }
        if self._phrase_list and len(self._phrase_list) > 0:
            # NOTE: this can be very large, for now we use a placeholder
            active_options["phrases"] = []
            #active_options["phrases"] = self._phrase_list
        else:
            active_options["phrases"] = []
        return active_options

    async def _handle_partial_result(self, result):
        """Handle a partial result"""
        if result and self._last_partial_str != result:
            self._last_partial_str = result
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            self._partial_result = norm_result
            #print("PARTIAL: ", self._partial_result)
            await self._send(self._partial_result, False)

    async def _handle_final_result(self, result, skip_send=False):
        """Handle a final result"""
        if result:
            #print("FINAL: ", result)
            norm_result = VoskProcessor.normalize_result_format(
                result, self._alternatives, self._return_words)
            if self._continuous_mode:
                # In continuous mode we send "intermediate" final results
                self._final_result = norm_result
                if not skip_send:
                    await self._send(self._final_result, True)
            else:
                # In non-continuous mode we remember one big result
                self._final_result = VoskProcessor.append_to_result(
                    self._final_result, norm_result)
            #print("FINAL (auto): ", self._final_result)

    async def _finish(self):
        """Tell recognizer to stop and handle last result"""
        last_result_was_final = (self._state == 2)
        self._state = 3
        if last_result_was_final and not self._continuous_mode:
            # Send final result (because we haven't done it yet)
            await self._send(self._final_result, True)
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
        elif last_result_was_final:
            # We don't need to do anything but reset ... right?
            # self._recognizer.Reset()  # TODO: we skip this to prevent ERROR if already reset
            pass
        else:
            # Request final
            result = self._recognizer.FinalResult()
            await self._handle_final_result(result, skip_send=True)
            await self._send(self._final_result, True)

    async def _send(self, json_result, is_final=False):
        """Send result"""
        features = {}
        alternatives = []
        if self._return_words:
            features["words"] = json_result.get("words", [])
        if self._speaker_detection:
            features["speaker_vector"] = json_result.get("spk", [])
        if self._alternatives > 0:
            alternatives = json_result.get("alternatives", [])
        transcript = json_result.get("text", "")
        # Post-processing?
        if is_final and transcript and self._optimize_final_result:
            # Optimize final transcription
            text2num_proc = TextToNumberProcessor(self._language)
            dt_optimizer = DateAndTimeOptimizer(self._language)
            transcript = text2num_proc.process(transcript)
            transcript = dt_optimizer.process(transcript)
        await self.send_transcript(transcript=transcript,
                                   is_final=is_final,
                                   confidence=json_result.get(
                                       "confidence", -1),
                                   features=features,
                                   alternatives=alternatives)

    # ---- Helper functions ----

    @staticmethod
    def normalize_result_format(result: str, alternatives=0, has_words=False):
        """Vosk has many different formats depending on settings
        Convert the result into a fixed format so we can handle it better."""
        json_result = json.loads(result)
        words = None
        if alternatives > 0 and "alternatives" in json_result:
            json_result = json_result.get("alternatives", [])
            # handle array
            alternatives = None
            if len(json_result) > 1:
                alternatives = json_result[1:]
            if has_words:
                words = json_result[0].get("result")
            return VoskProcessor.build_normalized_result(
                json_result[0], alternatives, words)
        else:
            # handle object
            if has_words:
                words = json_result.get("result")
            return VoskProcessor.build_normalized_result(
                json_result, None, words)

    @staticmethod
    def build_normalized_result(json_result, alternatives=None, words=None):
        """Build a result object that always looks the same"""
        # text or partial or empty:
        text = json_result.get(
            "text", json_result.get("partial", json_result.get("final", "")))
        confidence = json_result.get("confidence", -1)
        speaker_vec = json_result.get("spk")
        result = {
            "text": text,
            "confidence": confidence,
            "alternatives": alternatives
        }
        if words is not None:
            result["words"] = words
        if speaker_vec is not None:
            result["spk"] = speaker_vec
        return result

    @staticmethod
    def append_to_result(given_result, new_result):
        """Append a new result to a previous one, typically used for
        'intermediate' final result text"""
        text = new_result.get("text")
        if not text:
            return given_result
        #else:            # we can do more post-processing here maybe
        if "text" in given_result:
            given_result["text"] += ", " + text
            if "confidence" in new_result:
                # sloppy confidence merge (take the worst)
                given_result["confidence"] = min(
                    given_result.get("confidence", -1),
                    new_result.get("confidence", -1))
            if "words" in new_result:
                # append words
                given_words = given_result.get("words", [])
                new_words = new_result.get("words", [])
                if given_words and new_words:
                    given_result["words"] = given_words + new_words
            if "spk" in new_result:
                # take new speaker data - NOTE: not optimal
                given_result["spk"] = new_result.get(
                    "spk", given_result.get("spk", []))
            return given_result
        else:
            new_result["text"] = text
            return new_result
Example 18
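# Assumed context (not shown in the original snippet): 'video_name' is set
# earlier, and os, sys, subprocess plus vosk's Model/KaldiRecognizer are
# imported at the top of the file.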
print('Video name is ' + video_name)

stream = os.popen('ffmpeg -i ' + video_name + ' ' + sys.argv[1])
output = stream.read()
output

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

sample_rate = 16000
model = Model("model")
rec = KaldiRecognizer(model, sample_rate)

process = subprocess.Popen([
    'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
    str(sample_rate), '-ac', '1', '-f', 's16le', '-'
],
                           stdout=subprocess.PIPE)
while True:
    data = process.stdout.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        pass
    #        print(rec.Result())
    else:
        pass
Example 19
#!/usr/bin/python3

from vosk import Model, KaldiRecognizer
import sys
import json
import os

if not os.path.exists("model"):
    print ("Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder.")
    exit (1)


model = Model("model")

# Large vocabulary free form recognition
rec = KaldiRecognizer(model, 16000)

# You can also specify the possible word list
#rec = KaldiRecognizer(model, 16000, "zero oh one two three four five six seven eight nine")

wf = open(sys.argv[1], "rb")
wf.read(44) # skip header

while True:
    data = wf.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print (res['text'])
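# Hedged addition (the original snippet stops above): flush the last utterance
res = json.loads(rec.FinalResult())
print (res['text'])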
Example 20
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import json

SetLogLevel(0)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model(lang="en-us")
rec = KaldiRecognizer(model, wf.getframerate())

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
        break
    else:
        jres = json.loads(rec.PartialResult())
        print(jres)

        if jres['partial'] == "one zero zero zero":
            print("We can reset recognizer here and start over")
Example 21
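    # Method from a larger service class. Assumed context (not shown): os,
    # wave, json, datetime, vosk's KaldiRecognizer, and project-local
    # 'self.sftp_client', 'self.model', 'self.rate', 'self.config',
    # 'self.process_sttresult' and 'PostgresClient'.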
    def recognize(self, body):
        remote_file_path = body
        try:
            dialogue_id = (remote_file_path.split('/')[1]).split('.')[0]
            print('Dialogue id is {}'.format(dialogue_id))
            local_file_path = os.path.join(self.sftp_client.download_path, remote_file_path.split('/')[1])
            print(local_file_path, remote_file_path)
            self.sftp_client.download_file_local(local_file_path, remote_file_path)
        except Exception as e:
            print('Exception occurred, wrong filename format {}'.format(remote_file_path))
            print('Exception occurred: {}'.format(e))
            exit(1)

        recognition_result = []
        stt_recognizer = KaldiRecognizer(self.model, self.rate)
        if stt_recognizer is not None:
            try:
                wf = wave.open(local_file_path, "rb")
                if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                    print("Audio file must be WAV format mono PCM.")
                    exit(1)

                while True:
                    data = wf.readframes(8000)
                    if len(data) == 0:
                        break
                    if stt_recognizer.AcceptWaveform(data):
                        recognition_chunk = json.loads(stt_recognizer.Result())
                        if 'result' in recognition_chunk.keys():
                            recognition_result.append(recognition_chunk['result'])
                    else:
                        recognition_chunk = json.loads(stt_recognizer.PartialResult())
                        if 'result' in recognition_chunk.keys():
                            recognition_result.append(recognition_chunk['result'])
                print("Recognition result {}".format(json.dumps(recognition_result)))
                for phrase in recognition_result:
                    for word in phrase:
                        word['word'] = word['word'].replace("'", ' ')

                recognition_result = self.process_sttresult(recognition_result)
                # print('Result is {}'.format(json.dumps(recognition_result)))

                psql_client = PostgresClient()
                psql_client.init_app(config=self.config)
                psql_client.update_stt_result(result=json.dumps(recognition_result, ensure_ascii=False), dialogue_id=dialogue_id)

                print('Deleting local path {}'.format(local_file_path))
                os.remove(local_file_path)
                print('Function finished, result of recognition {}'.format(recognition_result))
            except Exception as e:
                try:
                    psql_client = PostgresClient()
                    psql_client.init_app(config=self.config)
                    cur_time = datetime.utcnow()
                    creation_time = psql_client.get_creation_time(dialogue_id=dialogue_id)
                    if (cur_time - creation_time).total_seconds() / 3600 > 3.:
                        psql_client.update_error_status(dialogue_id)
                    print('Exception occurred: {}; recognition took too long'.format(e))
                except:
                    exit(1)
        else:
            print('Please, init stt recognizer')
Example 22
 def get_recognizer(self, framerate):
     SetLogLevel(-1)
     model = Model(os.path.join(c.PLUGIN_PATH, "vosk_alternatives",
                                "model"))
     rec = KaldiRecognizer(model, framerate)
     return rec
Example 23
# Import the core lib
from core import SystemInfo
# Speech and audio dependencies used below
import pyttsx3
import pyaudio
from vosk import Model, KaldiRecognizer
#Speech Synthesis
engine = pyttsx3.init()


def speak(text):
    engine.say(text)
    engine.runAndWait()


#Speech recognition

model = Model("model")
rec = KaldiRecognizer(model, 16000)

# Opens microphone for listening.
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()

while True:
    data = stream.read(10000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
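        # Hedged completion: the original snippet is cut off here
        print(rec.Result())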
Example 24
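# Assumed context (not shown): pyaudio, json, time, struct, playsound,
# sqrt from math, vosk's Model/KaldiRecognizer, and constants RATE, FORMAT,
# CHANNELS, FPB (frames per buffer), SAMPLE_WIDTH, SHORT_NORMALIZE and
# TIMEOUT_LENGTH.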
class Recognizer():
    def __init__(self):
        self.Threshold = 0
        self.hot_word = 'пирс'
        self.flag = False
        # vosk
        self.model = Model("speech_model")
        self.rec = KaldiRecognizer(self.model, RATE)
        # pyaudio
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(format=FORMAT,
                                      channels=CHANNELS,
                                      rate=RATE,
                                      input=True,
                                      frames_per_buffer=FPB)
        self.stream.start_stream()

    # RMS (root mean square) noise level calculation
    @staticmethod
    def rms(frame):
        count = len(frame) / SAMPLE_WIDTH
        form = "%dh" % count
        shorts = struct.unpack(form, frame)
        sum_squares = 0.0
        for sample in shorts:
            n = sample * SHORT_NORMALIZE
            sum_squares += n * n
        rms = sqrt(sum_squares / count)
        return rms * 1000

    # Automatically adjusts microphone level to the environment
    def adjustment_to_noise(self, duration=1):
        seconds_per_buffer = FPB / RATE
        end_time = 0
        while True:
            end_time += seconds_per_buffer
            if end_time > duration:
                break
            data = self.stream.read(FPB)
            rms = self.rms(data)
            damping = 0.15**seconds_per_buffer
            target_rms = rms * 1.5
            # Exponentially smooth the threshold toward the measured ambient level
            self.Threshold = self.Threshold * damping + target_rms * (1 - damping)

    def speech_to_text(self):
        self.adjustment_to_noise()
        task = ''
        now = time.time()
        end = time.time() + TIMEOUT_LENGTH
        while now <= end:
            data = self.stream.read(FPB)
            # checking the ambient volume
            if self.rms(data) >= self.Threshold:
                end = time.time() + TIMEOUT_LENGTH / 1.2
            now = time.time()
            # vosk
            if self.rec.AcceptWaveform(data):
                text = json.loads(self.rec.Result())
                task = text['text']
        return task

    def start(self):
        while True:
            if self.flag:
                data = self.stream.read(FPB)
                if self.rec.AcceptWaveform(data):
                    text = json.loads(self.rec.Result())
                    task = text['text']
                    if self.hot_word in task:
                        playsound("audio/listen_to_you.mp3")
                        return True
Example 25
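# Assumed context (not shown): sys, os and wave are imported, vosk provides
# Model, SpkModel and KaldiRecognizer, and model_path / spk_model_path are
# defined earlier in the file.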
if not os.path.exists(spk_model_path):
    print(
        "Please download the speaker model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as {} in the current folder."
        .format(spk_model_path))
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

# Large vocabulary free form recognition
model = Model(model_path)
spk_model = SpkModel(spk_model_path)
rec = KaldiRecognizer(model, spk_model, wf.getframerate())

# We compare speakers with cosine distance. We can keep one or several fingerprints for the speaker in a database
# to distinguish among users.
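# A hedged sketch of that cosine-distance comparison (the helper and the
# numpy import are illustrative, not part of the original snippet):
import numpy as np

def cosine_dist(x, y):
    # x and y are speaker embedding vectors as returned in Vosk's "spk" field;
    # 0.0 means identical direction, larger values mean less similar speakers.
    nx, ny = np.array(x), np.array(y)
    return 1 - np.dot(nx, ny) / (np.linalg.norm(nx) * np.linalg.norm(ny))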
spk_sig = [
    4.658117, 1.277387, 3.346158, -1.473036, -2.15727, 2.461757, 3.76756,
    -1.241252, 2.333765, 0.642588, -2.848165, 1.229534, 3.907015, 1.726496,
    -1.188692, 1.16322, -0.668811, -0.623309, 4.628018, 0.407197, 0.089955,
    0.920438, 1.47237, -0.311365, -0.437051, -0.531738, -1.591781, 3.095415,
    0.439524, -0.274787, 4.03165, 2.665864, 4.815553, 1.581063, 1.078242,
    5.017717, -0.089395, -3.123428, 5.34038, 0.456982, 2.465727, 2.131833,
    4.056272, 1.178392, -2.075712, -1.568503, 0.847139, 0.409214, 1.84727,
    0.986758, 4.222116, 2.235512, 1.369377, 4.283126, 2.278125, -1.467577,
    -0.999971, 3.070041, 1.462214, 0.423204, 2.143578, 0.567174, -2.294655,
    1.864723, 4.307356, 2.610872, -1.238721, 0.551861, 2.861954, 0.59613,
    -0.715396, -1.395357, 2.706177, -2.004444, 2.055255, 0.458283, 1.231968,
Example 26
import os
import sys
import subprocess
import srt
import json
import datetime

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(-1)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

sample_rate = 16000
model = Model("model")
rec = KaldiRecognizer(model, sample_rate)

process = subprocess.Popen([
    'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
    str(sample_rate), '-ac', '1', '-f', 's16le', '-'
],
                           stdout=subprocess.PIPE)

WORDS_PER_LINE = 7


def transcribe():
    results = []
    subs = []
    while True:
        data = process.stdout.read(4000)
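        # Hedged completion of the truncated loop, mirroring the SRT-style
        # transcribe() function this snippet appears to share with Example 2:
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    results.append(rec.FinalResult())
    return results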
Example 27
import os
import sys
import wave

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(0)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetWords(True)

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
Example 28
#!/usr/bin/env python3

from vosk import Model, KaldiRecognizer
import os
import pyaudio

model = Model('model')
rec = KaldiRecognizer(model, 16000)

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()

while True:
    data = stream.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
Example 29
import sys
import json
import subprocess
from pathlib import Path

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(0)

model = Model("/home/jim/Playing/model")

if len(sys.argv) > 1:
    dir = Path(sys.argv[1])  # note: shadows the built-in dir()
else:
    exit()

if not dir.is_dir():
    exit()

for file in dir.glob('*.mp3'):
    rec = KaldiRecognizer(model, 16000)
    rec.SetWords(True)
    output = []
    outfile = dir / f"{file.stem}.json"
    print(f"{file}\n")
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i',
        str(file), '-ar', '16000', '-ac', '1', '-f', 's16le', '-'
    ],
                               stdout=subprocess.PIPE)

    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
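            # Hedged completion: the snippet is cut off here; collecting each
            # result and writing it out matches the 'output'/'outfile'
            # variables set up above.
            output.append(json.loads(rec.Result()))
    output.append(json.loads(rec.FinalResult()))
    outfile.write_text(json.dumps(output, indent=2))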