def wordTimestamp(self, text, lattice, frame_shift, frame_subsampling):
        try:
            # Apply the acoustic scale to the lattice weights before
            # extracting the best path.
            _fst.utils.scale_compact_lattice(
                [[1.0, 0], [0, float(self.acwt)]], lattice)
            bestPath = compact_lattice_shortest_path(lattice)
            # Undo the acoustic scaling on the best path.
            _fst.utils.scale_compact_lattice(
                [[1.0, 0], [0, 1.0/float(self.acwt)]], bestPath)
            # Word-align the best path; word_align_lattice returns a
            # (success, lattice) pair, so the aligned lattice is at index 1.
            bestLattice = word_align_lattice(
                bestPath, self.transition_model, self.info, 0)
            # alignment = (word ids, start frames, durations in frames)
            alignment = compact_lattice_to_word_alignment(bestLattice[1])
            words = _fst.indices_to_symbols(self.symbols, alignment[0])
            start = alignment[1]
            dur   = alignment[2]

            output = {}
            output["words"] = []
            for i in range(len(words)):
                meta = {}
                meta["word"] = words[i]
                meta["start"] = round(start[i] * frame_shift * frame_subsampling, 2)
                meta["end"] = round((start[i]+dur[i]) * frame_shift * frame_subsampling, 2)
                output["words"].append(meta)
                text += " "+meta["word"]
            output["text"] = text

        except Exception as e:
            self.log.error(e)
            raise ValueError("Decoder failed to create the word timestamps") from e
        else:
            return output
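A minimal sketch of how the frame-to-seconds conversion above behaves. The alignment arrays are made up, mirroring what compact_lattice_to_word_alignment returns, and the 10 ms frame shift and frame-subsampling factor of 3 are assumed typical chain-model values, not taken from this class:

# Hypothetical alignment output: word symbols, start frames, durations (frames)
words = ["hello", "world"]
start = [12, 45]
dur = [30, 52]

frame_shift = 0.01        # assumed: 10 ms per acoustic frame
frame_subsampling = 3     # assumed: chain-model frame-subsampling factor

for w, s, d in zip(words, start, dur):
    begin = round(s * frame_shift * frame_subsampling, 2)
    end = round((s + d) * frame_shift * frame_subsampling, 2)
    print(w, begin, end)  # "hello 0.36 1.26", "world 1.35 2.91"
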
Example 2

# Imports assumed for this example (the module paths follow the usual
# PyKaldi, ffmpeg-python and PyYAML layout and are an assumption here):
import os

import ffmpeg
import yaml
from kaldi.asr import NnetLatticeFasterRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.fstext import SymbolTable, shortestpath, indices_to_symbols
from kaldi.fstext.utils import get_linear_symbol_sequence
from kaldi.lat import functions
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    models_dir = "models/"

    # Read yaml File
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']
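    # For reference, a hypothetical sketch of the YAML layout this function
    # expects; the paths below are illustrative, not taken from the actual
    # model package, and only the keys used in this function are listed:
    #
    # decoder:
    #   model: kaldi_tuda_de_nnet3_chain2/final.mdl
    #   fst: kaldi_tuda_de_nnet3_chain2/HCLG.fst
    #   word-syms: kaldi_tuda_de_nnet3_chain2/words.txt
    #   mfcc-config: kaldi_tuda_de_nnet3_chain2/conf/mfcc_hires.conf
    #   ivector-extraction-config: kaldi_tuda_de_nnet3_chain2/conf/ivector_extractor.conf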

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    # write scp file
    with open(scp_filename, 'w') as scp_file:
        scp_file.write("%s tmp/%s.wav\n" % (filenameS_hash, filenameS_hash))

    # write spk2utt file
    with open(spk2utt_filename, 'w') as scp_file:
        scp_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

    # use ffmpeg to convert the input media file (any format!) to 16 kHz wav mono
    (
        ffmpeg
            .input(filenameS)
            .output("tmp/%s.wav" % filenameS_hash, acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run()
    )

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = asr_beamsize
    decoder_opts.max_active = asr_max_active
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        models_dir + decoder_yaml_opts["model"],
        models_dir + decoder_yaml_opts["fst"],
        models_dir + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)

    # Construct symbol table
    symbols = SymbolTable.read_text(models_dir + decoder_yaml_opts["word-syms"])
    phi_label = symbols.find_index("#0")

    # Define feature pipelines as Kaldi rspecifiers
    feats_rspec = ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:- |") % \
                  (models_dir + decoder_yaml_opts["mfcc-config"])
    ivectors_rspec = (
            ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:-"
             + " | ivector-extract-online2 --config=%s ark:" + spk2utt_filename + " ark:- ark:- |") %
            ((models_dir + decoder_yaml_opts["mfcc-config"]),
             (models_dir + decoder_yaml_opts["ivector-extraction-config"]))
    )

    did_decode = False
    # Decode wav files
    with SequentialMatrixReader(feats_rspec) as f, \
            SequentialMatrixReader(ivectors_rspec) as i:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            did_decode = True
            assert (fkey == ikey)
            out = asr.decode((feats, ivectors))
            best_path = functions.compact_lattice_shortest_path(out["lattice"])
            words, _, _ = get_linear_symbol_sequence(shortestpath(best_path))
            timing = functions.compact_lattice_to_word_alignment(best_path)

    assert(did_decode)

    # Map word indices back to word symbols
    words = indices_to_symbols(symbols, timing[0])

    # Build the data structure: (word, begin [frames], duration [frames])
    vtt = list(map(list, zip(words, timing[1], timing[2])))

    # Cleanup tmp files
    print('removing tmp file:', scp_filename)
    os.remove(scp_filename)
    print('removing tmp file:', wav_filename)
    os.remove(wav_filename)
    print('removing tmp file:', spk2utt_filename)
    os.remove(spk2utt_filename)
    return vtt, words
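
A hedged usage sketch for asr() above. The media file name is made up, the hash is only used to name the temporary files, and a models/ directory with the configured model plus a tmp/ directory are assumed to exist; the frame-to-seconds conversion assumes a 10 ms MFCC frame shift together with the frame_subsampling_factor of 3 set in decodable_opts:

import hashlib

media_file = "talk.mp4"                                   # hypothetical input
file_hash = hashlib.md5(media_file.encode()).hexdigest()

vtt, word_symbols = asr(file_hash, media_file)

frame_shift, frame_subsampling = 0.01, 3                  # assumed values
for word, begin, length in vtt:
    start_s = begin * frame_shift * frame_subsampling
    end_s = (begin + length) * frame_shift * frame_subsampling
    print("%.2f %.2f %s" % (start_s, end_s, word))
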
# RNNLM lattice rescoring; this assumes `asr` and `symbols` are constructed
# as above, and that the RNNLM-specific classes are imported, e.g.:
#   from kaldi.asr import LatticeRnnlmPrunedRescorer
#   from kaldi.lat.functions import ComposeLatticePrunedOptions
#   from kaldi.rnnlm import RnnlmComputeStateComputationOptions
rnnlm_opts = RnnlmComputeStateComputationOptions()
rnnlm_opts.bos_index = symbols.find_index("<s>")
rnnlm_opts.eos_index = symbols.find_index("</s>")
rnnlm_opts.brk_index = symbols.find_index("<brk>")
rnnlm_opts.brk_index = symbols.find_index("<brk>")
compose_opts = ComposeLatticePrunedOptions()
compose_opts.lattice_compose_beam = 4
rescorer = LatticeRnnlmPrunedRescorer.from_files(
    "lm/G.carpa",
    "rnnlm-get-word-embedding lm/word_feats.txt lm/feat_embedding.final.mat -|",
    "lm/final.raw",
    acoustic_scale=1.0,
    max_ngram_order=4,
    use_const_arpa=True,
    opts=rnnlm_opts,
    compose_opts=compose_opts)

# Define feature pipelines as Kaldi rspecifiers
feats_rspec = "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- |"
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:-"
    " | ivector-extract-online2 --config=ivector.conf ark:spk2utt ark:- ark:- |"
)

# Decode wav files
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i:
    for (fkey, feats), (ikey, ivectors) in zip(f, i):
        assert (fkey == ikey)
        out = asr.decode((feats, ivectors))
        rescored_lat = rescorer.rescore(out["lattice"])
        words, _, _ = get_linear_symbol_sequence(shortestpath(rescored_lat))
        print(fkey, " ".join(indices_to_symbols(symbols, words)), flush=True)