Example #1
    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)
        print('Initializing aspire model...')
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = 13
        decoder_opts.max_active = 7000
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150
        self.asr = NnetLatticeFasterRecognizer.from_files(
            "/home/chris/git/pykaldi/examples/setups/aspire/exp/tdnn_7b_chain_online/final.mdl",
            "/home/chris/git/pykaldi/examples/setups/aspire/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
            "/home/chris/git/pykaldi/examples/setups/aspire/data/lang/words.txt",
            decoder_opts=decoder_opts,
            decodable_opts=decodable_opts)

        fd, fn = tempfile.mkstemp()
        os.close(fd)  # close the descriptor mkstemp opens so it does not leak
        os.remove(fn)
        self.scp_fn = scp_fn = '%s.scp' % fn

        # Define feature pipelines as Kaldi rspecifiers
        self.feats_rspec = (
            f"ark:compute-mfcc-feats --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/mfcc_hires.conf scp:{scp_fn} ark:- |"
        )
        self.ivectors_rspec = (
            f"ark:compute-mfcc-feats --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/mfcc_hires.conf scp:{scp_fn} ark:- |"
            f"ivector-extract-online2 --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/ivector_extractor.conf " \
            f"ark:/home/chris/git/pykaldi/examples/setups/aspire/data/test/spk2utt ark:- ark:- |"
        )
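
The scp file created in __init__ stays empty in this excerpt; nothing will decode until a "<utt-id> <wav-path>" line is written to it. A hypothetical sketch (the `tester` instance, utterance id, and wav path are placeholders):

# Hypothetical: register one 16 kHz mono recording with the pipelines
# above, then decode it with the loop shown after Example #3.
with open(tester.scp_fn, "w") as scp:
    scp.write("utt1 /path/to/recording.wav\n")
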
Example #2
    def __init__(self, scp, model, graph, words, conf, iconf, spk2utt, output, printed=False, log=False):
        """
        Инициализация транскриптора
        
        Аргументы:
            scp: путь к .SCP файлу с аудио
            model: путь к .MDL файлу модели распознавания
            graph: путь к .FST файлу общего графа распознавания
            words: путь к .TXT файлу текстового корпуса
            conf: путь к .CONF конфигурационному файлу распознавания
            iconf: путь к .CONF конфигурационному файлу векторного экстрактора
            spk2utt: путь к файлу перечисления сегментов для каждого говорящего
            output: путь к директории с результатами распознавания
            printed: признак печати результатов распознавания
            log: признак логирования
        """  
        self.scp = scp
        self.model = model
        self.graph = graph
        self.words = words
        self.conf = conf
        self.iconf = iconf
        self.spk2utt = spk2utt
        self.output = Path(output)
        self.printed = printed
        self.log = log

        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = 13
        decoder_opts.max_active = 7000
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        self.asr = NnetLatticeFasterRecognizer.from_files(self.model, self.graph, self.words,
                decoder_opts=decoder_opts, decodable_opts=decodable_opts)
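
The enclosing class name is not shown in this excerpt; a minimal construction sketch, assuming the class is called Transcriber (hypothetical) and that the listed files exist:

# Sketch only: `Transcriber` is an assumed name for the enclosing class,
# and every path below is a placeholder.
t = Transcriber(scp="data/wav.scp",
                model="exp/chain/final.mdl",
                graph="exp/chain/graph/HCLG.fst",
                words="exp/chain/graph/words.txt",
                conf="conf/mfcc_hires.conf",
                iconf="conf/ivector_extractor.conf",
                spk2utt="data/test/spk2utt",
                output="results",
                printed=True)
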
Example #3
#!/usr/bin/env python

from __future__ import print_function

from kaldi.asr import NnetLatticeFasterRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterRecognizer.from_files(
    "exp/tdnn_7b_chain_online/final.mdl",
    "exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
    "exp/tdnn_7b_chain_online/graph_pp/words.txt",
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts)

# Define feature pipelines as Kaldi rspecifiers
feats_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |")
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
    "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |"
)
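
The script imports SequentialMatrixReader but the excerpt ends before it is used. A sketch of the decoding loop these rspecifiers feed, following the same pattern as the loop in Example #6 below:

# Decode each utterance by reading both feature pipelines in lockstep
# (sketch; the keys from the two readers must match utterance by utterance).
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i:
    for (fkey, feats), (ikey, ivectors) in zip(f, i):
        assert fkey == ikey
        out = asr.decode((feats, ivectors))
        print(fkey, out["text"])
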
Example #4
    def main(self):
        # Construct recognizer
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = 13
        decoder_opts.max_active = 7000
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150
        asr = NnetLatticeFasterRecognizer.from_files(
            self.dir_path + "/exp/tdnn_7b_chain_online/final.mdl",
            self.dir_path + "/new/graph/HCLG.fst",
            self.dir_path + "/new/graph/words.txt",
            decoder_opts=decoder_opts,
            decodable_opts=decodable_opts)

        p = pyaudio.PyAudio()

        # ############################################
        # sentiment_analyzer = SentimentAnalyzer(self.dir_path)
        # model = load_model(self.dir_path + '/lstm.h5')
        ############################################

        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

        print("* recording")

        audio2send = []
        cur_data = b''  # current chunk of audio data (stream.read returns bytes)
        rel = RATE / CHUNK
        slid_win = deque(maxlen=int(SILENCE_LIMIT * rel) + 1)
        # Prepend audio from 0.5 seconds before noise was detected
        prev_audio = deque(maxlen=int(PREV_AUDIO * rel) + 1)
        started = False
        n = num_phrases
        response = []

        while num_phrases == -1 or n > 0:
            cur_data = stream.read(CHUNK)
            slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
            # print(slid_win[-1])
            if any(x > THRESHOLD for x in slid_win):
                if not started:
                    # print("Starting record of phrase")
                    started = True
                audio2send.append(cur_data)
            elif started:
                # print("Finished")
                # The limit was reached, finish capture and deliver.
                filename = self.save_speech(list(prev_audio) + audio2send, p, self.SAVE_PATH, self.WAVE_OUTPUT_FILENAME)
                # Decode the saved file with the Kaldi recognizer
                r = self.recognize_speech(asr)
                if num_phrases == -1:
                    print("Detected speech: ", r)
                    # if r != None:
                        # sentiment_analyzer.get_sentiment(r, model)
                else:
                    response.append(r)
                # Remove temp file. Comment line to review.
                os.remove(filename)
                # Reset all
                started = False
                slid_win = deque(maxlen=int(SILENCE_LIMIT * rel) + 1)
                prev_audio = deque(maxlen=int(PREV_AUDIO * rel) + 1)
                audio2send = []
                n -= 1
            else:
                prev_audio.append(cur_data)

        stream.stop_stream()
        stream.close()
        p.terminate()
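
Example #4 calls a save_speech helper that the excerpt does not show. A hypothetical sketch of it (the name and argument list are taken from the call site above; the body is an assumption) that writes the buffered chunks as a 16-bit wav via the standard wave module:

import wave

# Hypothetical implementation matching the call site above; the real
# helper is not part of the excerpt.
def save_speech(self, data, p, save_path, wave_output_filename):
    path = os.path.join(save_path, wave_output_filename)
    wf = wave.open(path, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))  # bytes per sample for FORMAT
    wf.setframerate(RATE)
    wf.writeframes(b''.join(data))  # concatenate the buffered chunks
    wf.close()
    return path
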
Example #5
    def init(self, nnet_directory, transcription_directory):
        return_msg = "KaldiDecoder:init"
        debug_data = []
        feats = ""
        ivectors = ""
        decoder_opts = None
        decodable_opts = None
        asr = None

        ## input validation
        if nnet_directory is not None:
            if not isinstance(nnet_directory, str):
                return_msg += ": nnet_directory is not of type string, is type {}".format(
                    type(nnet_directory))
                return {
                    RDK.success: RC.input_validation,
                    RDK.return_msg: return_msg,
                    RDK.debug_data: debug_data
                }
        else:
            nnet_directory = KaldiNnetDecoder.CV_default_nnet_directory

        if transcription_directory is not None:
            if not isinstance(transcription_directory, str):
                return_msg += ": transcription_directory is not of type string, is type {}".format(
                    type(transcription_directory))
                return {
                    RDK.success: RC.input_validation,
                    RDK.return_msg: return_msg,
                    RDK.debug_data: debug_data
                }
        else:
            transcription_directory = KaldiNnetDecoder.CV_default_transcription_directory
        ##</end> input validation

        ## feats and ivector rspec creation
        feats = (
            "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
        ).format(nnet_directory, transcription_directory)

        ivectors = (
            "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
            "ivector-extract-online2 --config={0}/conf/ivector_extractor.conf ark:{1}/spk2utt ark:- ark:- |"
        ).format(nnet_directory, transcription_directory)
        ##</end> feats and ivector rspec creation

        ## asr creation
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = 13
        decoder_opts.max_active = 7000

        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150

        asr = NnetLatticeFasterRecognizer.from_files(
            "{}/final.mdl".format(nnet_directory),
            "{}/graph/HCLG.fst".format(nnet_directory),
            "{}/graph/words.txt".format(nnet_directory),
            decoder_opts=decoder_opts,
            decodable_opts=decodable_opts)
        ##</end> asr creation

        self.IV_feats = feats
        self.IV_ivectors = ivectors
        self.IV_asr = asr
        self.IV_is_ready = True

        return {
            RDK.success: RC.success,
            RDK.return_msg: return_msg,
            RDK.debug_data: debug_data
        }
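
A usage sketch for Example #5, assuming KaldiNnetDecoder (referenced above) is the enclosing class and RDK/RC are its status-key and return-code enums; the directory arguments are placeholders:

# Sketch only; paths are placeholders.
decoder = KaldiNnetDecoder()
result = decoder.init(nnet_directory="exp/nnet3_model",
                      transcription_directory="data/session1")
if result[RDK.success] == RC.success:
    print("decoder ready:", decoder.IV_is_ready)
else:
    print("init failed:", result[RDK.return_msg])
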
Example #6
def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    models_dir = "models/"

    # Read yaml File
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    # write scp file
    with open(scp_filename, 'w') as scp_file:
        scp_file.write("%s tmp/%s.wav\n" % (filenameS_hash, filenameS_hash))

    # write spk2utt file
    with open(spk2utt_filename, 'w') as scp_file:
        scp_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

    # use ffmpeg to convert the input media file (any format!) to 16 kHz mono wav
    (
        ffmpeg
            .input(filenameS)
            .output("tmp/%s.wav" % filenameS_hash, acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run()
    )

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = asr_beamsize
    decoder_opts.max_active = asr_max_active
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        models_dir + decoder_yaml_opts["model"],
        models_dir + decoder_yaml_opts["fst"],
        models_dir + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)

    # Construct symbol table
    symbols = SymbolTable.read_text(models_dir + decoder_yaml_opts["word-syms"])
    phi_label = symbols.find_index("#0")

    # Define feature pipelines as Kaldi rspecifiers
    mfcc_config = models_dir + decoder_yaml_opts["mfcc-config"]
    ivector_config = models_dir + decoder_yaml_opts["ivector-extraction-config"]
    feats_rspec = (
        "ark:compute-mfcc-feats --config=%s scp:%s ark:- |"
        % (mfcc_config, scp_filename))
    ivectors_rspec = (
        "ark:compute-mfcc-feats --config=%s scp:%s ark:- |"
        " ivector-extract-online2 --config=%s ark:%s ark:- ark:- |"
        % (mfcc_config, scp_filename, ivector_config, spk2utt_filename))

    did_decode = False
    # Decode wav files
    with SequentialMatrixReader(feats_rspec) as f, \
            SequentialMatrixReader(ivectors_rspec) as i:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            did_decode = True
            assert (fkey == ikey)
            out = asr.decode((feats, ivectors))
            best_path = functions.compact_lattice_shortest_path(out["lattice"])
            words, _, _ = get_linear_symbol_sequence(shortestpath(best_path))
            timing = functions.compact_lattice_to_word_alignment(best_path)

    assert(did_decode)

    # Map the word indices back to word symbols
    words = indices_to_symbols(symbols, timing[0])

    # Create (word, begin (frames), duration (frames)) triples
    vtt = list(map(list, zip(words, timing[1], timing[2])))

    # Cleanup tmp files
    print('removing tmp file:', scp_filename)
    os.remove(scp_filename)
    print('removing tmp file:', wav_filename)
    os.remove(wav_filename)
    print('removing tmp file:', spk2utt_filename)
    os.remove(spk2utt_filename)
    return vtt, words
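
The frame indices in vtt are at the decoder's output frame rate. With frame_subsampling_factor = 3 (set above) and Kaldi's default 10 ms frame shift (an assumption; the actual shift comes from the MFCC config), each output frame spans 30 ms, so the triples convert to seconds like this:

# Sketch: convert the (word, begin, duration) frame triples returned by
# asr() into (word, start_s, end_s); assumes a 10 ms frame shift.
vtt, words = asr(filenameS_hash, filenameS)
frame_shift = 0.01 * 3  # seconds per output frame after subsampling
segments = [(w, b * frame_shift, (b + d) * frame_shift) for w, b, d in vtt]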