Beispiel #1
0
    def __init__(self, idx_to_char, params=None):
        """Load the Kaldi resources and decoding options for this object.

        Args:
            idx_to_char: Lookup from model output index to phone symbol. It
                is stored on the instance and passed to ``create_phone_map``
                to build the two reorder arrays.
            params: Configuration dict. The keys ``phones_path``,
                ``words_path``, ``mdl_path`` and ``fst_path`` are required.
                Optional keys and their defaults: ``acoustic`` (1.2),
                ``beam`` (13) and ``alphaweight`` (0.3).

        Raises:
            KeyError: If a required key is missing from ``params``.
        """
        # Use a fresh dict per call rather than one shared mutable default.
        if params is None:
            params = {}

        self.idx_to_char = idx_to_char

        self.reorder_1, self.reorder_2 = create_phone_map(
            params['phones_path'], idx_to_char)
        self.word_syms = SymbolTable.read_text(params['words_path'])

        self.acoustic_scale = params.get('acoustic', 1.2)
        if self.acoustic_scale < 0:
            print("Warning: acoustic scale is less than 0")
        self.alphaweight = params.get('alphaweight', 0.3)

        # Read the transition model from the model file.
        trans_model = TransitionModel()
        with xopen(params['mdl_path']) as ki:
            trans_model.read(ki.stream(), ki.binary)

        decoder_opts = FasterDecoderOptions()
        decoder_opts.beam = params.get('beam', 13)

        decode_fst = read_fst_kaldi(params['fst_path'])

        self.decoder_opts = decoder_opts
        self.trans_model = trans_model
        self.decode_fst = decode_fst

        # Language-model statistics bookkeeping, initially empty.
        self.stats = LMStats()
        self.stats_state = None
        self.add_stats_phase = True
Beispiel #2
0
def create_phone_map(filename, idx_to_char):
    """Build index arrays that map model phone indices to symbol-table ids.

    Args:
        filename: Path of the phone symbol table in text format.
        idx_to_char: Lookup from model output index to phone symbol. Indices
            ``1 .. len(idx_to_char)`` are looked up; index 0 is treated as
            the symbol ``"EPS"``.

    Returns:
        A pair ``(reorder_1, reorder_2)`` of NumPy arrays of equal length.
        ``reorder_2`` lists the model indices whose (possibly morphed)
        symbol exists in the symbol table, and ``reorder_1`` holds the
        matching symbol-table id minus one.

    Raises:
        KeyError: If the symbol table has no ``NON`` entry.
    """
    # The symbol table is parsed with the library reader. It does not decode
    # the symbols as UTF-8, so each one is decoded by hand here.
    phone_table = SymbolTable.read_text(filename)
    ph_to_idx = {
        phone_table.find_symbol(i).decode('utf8'): i
        for i in range(phone_table.num_symbols())
    }

    # "EPS" shares the id of "NON"; morphed symbols share the id of the
    # symbol they map to, when that symbol is known.
    ph_to_idx['EPS'] = ph_to_idx['NON']
    for key, target in DICT_MORPH.items():
        if target in ph_to_idx:
            ph_to_idx[key] = ph_to_idx[target]

    reorder_1 = []
    reorder_2 = []
    for pyphnid in range(len(idx_to_char) + 1):
        if pyphnid == 0:
            symbol = "EPS"
        else:
            symbol = idx_to_char[pyphnid]
            symbol = DICT_MORPH.get(symbol, symbol)
        table_id = ph_to_idx.get(symbol)
        if table_id is None:
            # Symbol is unknown to the symbol table: leave it unmapped.
            continue

        reorder_1.append(table_id - 1)
        reorder_2.append(pyphnid)

    return np.array(reorder_1), np.array(reorder_2)
Beispiel #3
0
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct aligner
# The decodable options are passed to the aligner below: acoustic scale 1.0,
# frame subsampling factor 3 and 150 frames per chunk.
# NOTE(review): NnetAligner and SymbolTable are not imported in the visible
# part of this snippet; presumably imported above -- confirm.
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
aligner = NnetAligner.from_files("exp/tdnn_7b_chain_online/final.mdl",
                                 "exp/tdnn_7b_chain_online/tree",
                                 "exp/langdir/L.fst",
                                 "exp/langdir/words.txt",
                                 "exp/langdir/phones/disambig.int",
                                 decodable_opts=decodable_opts)
# Phone symbol table and word-boundary information, both read from the
# language directory.
phones = SymbolTable.read_text("exp/langdir/phones.txt")
wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                     "exp/langdir/phones/word_boundary.int")

# Define feature pipelines as Kaldi rspecifiers
# Each rspecifier is a shell pipeline whose output is read as an archive.
feats_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |")
# The two adjacent literals are concatenated, so the MFCC output is piped
# straight into ivector-extract-online2.
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
    "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |"
)

# Align wav files
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i, \
     open("data/text") as t, \
Beispiel #4
0
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct aligner
# The decodable options are passed to the aligner below: acoustic scale 1.0,
# frame subsampling factor 3 and 150 frames per chunk.
# NOTE(review): NnetAligner and SymbolTable are not imported in the visible
# part of this snippet; presumably imported above -- confirm.
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
aligner = NnetAligner.from_files("exp/tdnn_7b_chain_online/final.mdl",
                                 "exp/tdnn_7b_chain_online/tree",
                                 "data/lang/L.fst",
                                 "data/lang/words.txt",
                                 "data/lang/phones/disambig.int",
                                 decodable_opts=decodable_opts)
# Phone symbol table and word-boundary information, both read from
# data/lang.
phones = SymbolTable.read_text("data/lang/phones.txt")
wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                     "data/lang/phones/word_boundary.int")

# Define feature pipelines as Kaldi rspecifiers
# Each rspecifier is a shell pipeline whose output is read as an archive.
feats_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
)
# The two adjacent literals are concatenated, so the MFCC output is piped
# straight into ivector-extract-online2.
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
    "ivector-extract-online2 --config=conf/ivector_extractor.conf ark:data/test/spk2utt ark:- ark:- |"
)

# Align wav files
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i, \
Beispiel #5
0
def gmm_decode_faster(model_rxfilename, fst_rxfilename,
                      feature_rspecifier, words_wspecifier,
                      alignment_wspecifier="", lattice_wspecifier="",
                      word_symbol_table="", acoustic_scale=0.1,
                      allow_partial=True, decoder_opts=None):
    """Decode feature matrices with a GMM acoustic model and FasterDecoder.

    Args:
        model_rxfilename: Model file; the transition model and then the
            diagonal-GMM acoustic model are read from it.
        fst_rxfilename: Decoding graph read with ``read_fst_kaldi``.
        feature_rspecifier: Rspecifier of the feature matrices to decode.
        words_wspecifier: Wspecifier receiving the word-id sequence of each
            decoded utterance.
        alignment_wspecifier: Wspecifier for the alignments; they are only
            written when the resulting writer is open.
        lattice_wspecifier: Wspecifier for the best path as a compact
            lattice; only written when the resulting writer is open.
        word_symbol_table: Path of a word symbol table. When non-empty, the
            decoded words are also printed to stderr as symbols.
        acoustic_scale: Scale handed to the decodable object. Its inverse is
            applied to the best path before it is written as a lattice.
        allow_partial: If False, utterances for which the decoder did not
            reach a final state count as failures and are skipped.
        decoder_opts: Decoder options. ``None`` (the default) creates a new
            ``FasterDecoderOptions()`` for this call.

    Returns:
        True if at least one utterance was decoded, otherwise False.

    Raises:
        RuntimeError: If the word symbol table cannot be read.
    """
    # Build default options per call; a default instance in the signature
    # would be created once at definition time and shared by every call.
    if decoder_opts is None:
        decoder_opts = FasterDecoderOptions()

    # Read model.
    trans_model = TransitionModel()
    am_gmm = AmDiagGmm()
    with xopen(model_rxfilename) as ki:
        trans_model.read(ki.stream(), ki.binary)
        am_gmm.read(ki.stream(), ki.binary)

    # Open table readers/writers.
    feature_reader = SequentialMatrixReader(feature_rspecifier)
    words_writer = IntVectorWriter(words_wspecifier)
    alignment_writer = IntVectorWriter(alignment_wspecifier)
    clat_writer = CompactLatticeWriter(lattice_wspecifier)

    try:
        # Read symbol table.
        word_syms = None
        if word_symbol_table != "":
            word_syms = SymbolTable.read_text(word_symbol_table)
            if not word_syms:
                raise RuntimeError("Could not read symbol table from file {}"
                                   .format(word_symbol_table))

        # NOTE:
        # It is important to read decode_fst after opening feature reader as
        # it can prevent crashes on systems without enough virtual memory.

        # Read decoding graph and instantiate decoder.
        decode_fst = read_fst_kaldi(fst_rxfilename)
        decoder = FasterDecoder(decode_fst, decoder_opts)

        tot_like = 0.0
        frame_count = 0
        num_success, num_fail = 0, 0
        start = time.time()

        for key, features in feature_reader:
            if features.num_rows == 0:
                num_fail += 1
                logging.warning("Zero-length utterance: {}".format(key))
                continue

            gmm_decodable = DecodableAmDiagGmmScaled(am_gmm, trans_model,
                                                     features, acoustic_scale)
            decoder.decode(gmm_decodable)

            if not (allow_partial or decoder.reached_final()):
                num_fail += 1
                logging.warning(
                    "Did not successfully decode utterance {}, len = {}"
                    .format(key, features.num_rows))
                continue

            try:
                best_path = decoder.get_best_path()
            except RuntimeError:
                num_fail += 1
                logging.warning(
                    "Did not successfully decode utterance {}, len = {}"
                    .format(key, features.num_rows))
                continue

            if not decoder.reached_final():
                logging.warning("Decoder did not reach end-state, outputting "
                                "partial traceback since --allow-partial=true")

            ali, words, weight = get_linear_symbol_sequence(best_path)

            words_writer[key] = words

            if alignment_writer.is_open():
                alignment_writer[key] = ali

            if clat_writer.is_open():
                # Undo the acoustic scaling before writing the lattice.
                if acoustic_scale != 0.0:
                    scale = acoustic_lattice_scale(1.0 / acoustic_scale)
                    scale_lattice(scale, best_path)
                best_path = convert_lattice_to_compact_lattice(best_path)
                clat_writer[key] = best_path

            if word_syms:
                syms = convert_indices_to_symbols(word_syms, words)
                print(key, " ".join(syms), file=sys.stderr)

            num_success += 1
            frame_count += features.num_rows
            like = -(weight.value1 + weight.value2)
            tot_like += like
            logging.info("Log-like per frame for utterance {} is {} over {} "
                         "frames.".format(key, like / features.num_rows,
                                          features.num_rows))
            logging.debug("Cost for utterance {} is {} + {}"
                          .format(key, weight.value1, weight.value2))

        elapsed = time.time() - start
        # With no decoded frames the per-frame figures are undefined; report
        # NaN instead of raising ZeroDivisionError in the summary.
        if frame_count > 0:
            real_time_factor = elapsed * 100 / frame_count
            like_per_frame = tot_like / frame_count
        else:
            real_time_factor = like_per_frame = float("nan")

        logging.info("Time taken [excluding initialization] {}s: real-time "
                     "factor assuming 100 frames/sec is {}"
                     .format(elapsed, real_time_factor))
        logging.info("Done {} utterances, failed for {}"
                     .format(num_success, num_fail))
        logging.info("Overall log-likelihood per frame is {} over {} frames."
                     .format(like_per_frame, frame_count))
    finally:
        # Release the tables even when decoding raised.
        feature_reader.close()
        words_writer.close()
        if alignment_writer.is_open():
            alignment_writer.close()
        if clat_writer.is_open():
            clat_writer.close()

    return num_success != 0
Beispiel #6
0
def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    """Transcribe a media file with the Kaldi nnet3 chain model.

    The input is converted to 16 kHz mono PCM wav with ffmpeg, features and
    i-vectors are computed through Kaldi command pipelines, and the best
    path of the decoded lattice is turned into a word alignment.

    Args:
        filenameS_hash: Identifier used as the utterance/speaker key and as
            the stem of the temporary files under ``tmp/``.
        filenameS: Path of the input media file handed to ffmpeg.
        asr_beamsize: Decoder beam.
        asr_max_active: Decoder ``max_active`` setting.

    Returns:
        A tuple ``(vtt, words)``. ``words`` is the list of recognized word
        symbols; ``vtt`` is a list of ``[word, timing[1], timing[2]]``
        entries taken from the word alignment.

    Raises:
        AssertionError: If nothing was decoded, or if the feature and
            i-vector keys differ.
    """
    models_dir = "models/"

    # Read the model description (yaml).
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    try:
        # Write the scp file that maps the key to the wav file.
        with open(scp_filename, 'w') as scp_file:
            scp_file.write("%s %s\n" % (filenameS_hash, wav_filename))

        # Write the spk2utt file; the key is both speaker and utterance.
        with open(spk2utt_filename, 'w') as spk2utt_file:
            spk2utt_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

        # Use ffmpeg to convert the input media file (any format!) to
        # 16 kHz mono wav.
        (
            ffmpeg
            .input(filenameS)
            .output(wav_filename, acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run()
        )

        # Construct recognizer.
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = asr_beamsize
        decoder_opts.max_active = asr_max_active
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150
        # Named "recognizer" so the local does not shadow this function.
        recognizer = NnetLatticeFasterRecognizer.from_files(
            models_dir + decoder_yaml_opts["model"],
            models_dir + decoder_yaml_opts["fst"],
            models_dir + decoder_yaml_opts["word-syms"],
            decoder_opts=decoder_opts, decodable_opts=decodable_opts)

        # Construct symbol table.
        symbols = SymbolTable.read_text(
            models_dir + decoder_yaml_opts["word-syms"])

        # Define feature pipelines as Kaldi rspecifiers.
        mfcc_config = models_dir + decoder_yaml_opts["mfcc-config"]
        ivector_config = (
            models_dir + decoder_yaml_opts["ivector-extraction-config"])
        feats_rspec = (
            "ark:compute-mfcc-feats --config=%s scp:" + scp_filename
            + " ark:- |") % mfcc_config
        ivectors_rspec = (
            "ark:compute-mfcc-feats --config=%s scp:" + scp_filename
            + " ark:-"
            + " | ivector-extract-online2 --config=%s ark:"
            + spk2utt_filename + " ark:- ark:- |"
        ) % (mfcc_config, ivector_config)

        did_decode = False
        # Decode wav files.
        with SequentialMatrixReader(feats_rspec) as f, \
                SequentialMatrixReader(ivectors_rspec) as i:
            for (fkey, feats), (ikey, ivectors) in zip(f, i):
                did_decode = True
                assert (fkey == ikey)
                out = recognizer.decode((feats, ivectors))
                best_path = functions.compact_lattice_shortest_path(
                    out["lattice"])
                # NOTE(review): this result is overwritten below and never
                # used; kept so the same library calls are exercised.
                words, _, _ = get_linear_symbol_sequence(
                    shortestpath(best_path))
                timing = functions.compact_lattice_to_word_alignment(
                    best_path)

        assert (did_decode)

        # Map the word ids of the alignment to their symbols.
        words = indices_to_symbols(symbols, timing[0])

        # Create the data structure (Word, begin(Frames), end(Frames)).
        # NOTE(review): pykaldi may return a length rather than an end
        # frame as the third alignment field -- confirm.
        vtt = list(map(list, zip(words, timing[1], timing[2])))

        return vtt, words
    finally:
        # Clean up the tmp files on failure as well, so aborted runs do not
        # leave them behind.
        for tmp_filename in (scp_filename, wav_filename, spk2utt_filename):
            if os.path.exists(tmp_filename):
                print('removing tmp file:', tmp_filename)
                os.remove(tmp_filename)
Beispiel #7
0
import os

from kaldi.alignment import NnetAligner
from kaldi.fstext import SymbolTable
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct aligner
# The decodable options are passed to the aligner below: acoustic scale 1.0
# and frame subsampling factor 3.
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
aligner = NnetAligner.from_files("final.mdl", "tree", "L.fst", "words.txt",
                                 "disambig.int", decodable_opts=decodable_opts)
# Phone symbol table and word-boundary information, read from the working
# directory.
phones = SymbolTable.read_text("phones.txt")
wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                     "word_boundary.int")

# Define feature pipelines as Kaldi rspecifiers
# Each rspecifier is a shell pipeline whose output is read as an archive.
feats_rspec = "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- |"
# The two adjacent literals are concatenated, so the MFCC output is piped
# into ivector-extract-online2.
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:-"
    " | ivector-extract-online2 --config=ivector.conf ark:spk2utt ark:- ark:- |"
    )

# Align wav files
# Features, i-vectors and transcript lines are read in lockstep; zip stops
# at the shortest of the three inputs.
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i, open("text") as t:
    for (fkey, feats), (ikey, ivectors), line in zip(f, i, t):
        # Split each transcript line at the first whitespace run into the
        # utterance key and the text.
        # NOTE(review): the loop body ends here in this view; the alignment
        # step itself is not shown.
        tkey, text = line.strip().split(None, 1)
# Construct recognizer
# NOTE(review): this part reads like a separate example. The names
# LatticeFasterDecoderOptions, NnetLatticeFasterRecognizer,
# RnnlmComputeStateComputationOptions, ComposeLatticePrunedOptions and
# LatticeRnnlmPrunedRescorer are not imported in the visible part.
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
# Rebinds decodable_opts to a new options object for the recognizer.
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterRecognizer.from_files("final.mdl",
                                             "HCLG.fst",
                                             decoder_opts=decoder_opts,
                                             decodable_opts=decodable_opts)

# Construct RNNLM rescorer
# The special-token indices are looked up in the RNNLM word symbol table.
symbols = SymbolTable.read_text("lm/words.txt")
rnnlm_opts = RnnlmComputeStateComputationOptions()
rnnlm_opts.bos_index = symbols.find_index("<s>")
rnnlm_opts.eos_index = symbols.find_index("</s>")
rnnlm_opts.brk_index = symbols.find_index("<brk>")
compose_opts = ComposeLatticePrunedOptions()
compose_opts.lattice_compose_beam = 4
# The second argument is a command pipeline (it ends in "-|"), not a plain
# file path.
rescorer = LatticeRnnlmPrunedRescorer.from_files(
    "lm/G.carpa",
    "rnnlm-get-word-embedding lm/word_feats.txt lm/feat_embedding.final.mat -|",
    "lm/final.raw",
    acoustic_scale=1.0,
    max_ngram_order=4,
    use_const_arpa=True,
    opts=rnnlm_opts,
    compose_opts=compose_opts)
Beispiel #9
0
    acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary)


# Define the decodable wrapper: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Return a ``(features, acoustic_scale) -> decodable`` factory.

    The returned callable closes over ``trans_model`` and
    ``acoustic_model`` and builds a ``DecodableAmDiagGmmScaled`` from them
    for every feature matrix it is given.
    """
    def decodable_wrapper(features, acoustic_scale):
        """Build the scaled GMM decodable for one set of features."""
        decodable = DecodableAmDiagGmmScaled(
            acoustic_model, trans_model, features, acoustic_scale)
        return decodable

    return decodable_wrapper


# Bind the models to the decodable factory.
# NOTE(review): trans_model is not defined in the visible part of this
# snippet; presumably read next to acoustic_model above -- confirm.
decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model)

# Define the decoder
# Lattice-generating decoder over the decoding graph, with beam 13.0 and
# lattice beam 6.0.
decoding_graph = read_fst_kaldi("models/mono/graph/HCLG.fst")
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13.0
decoder_opts.lattice_beam = 6.0
decoder = LatticeFasterDecoder(decoding_graph, decoder_opts)

# Define the recognizer
# The recognizer is given the decoder, the decodable factory and the word
# symbol table.
symbols = SymbolTable.read_text("models/mono/graph/words.txt")
asr = Recognizer(decoder, decodable_wrapper, symbols)

# Decode wave files
# (Disabled example loop; the indentation of its body was lost when it was
# commented out.)
# for key, wav in SequentialWaveReader("scp:wav.scp"):
# feats = feat_pipeline(wav)
# out = asr.decode(feats)
# print(key, out["text"], flush=True)
Beispiel #10
0
# Input locations for the sample list and the EpaDB corpus root.
sample_list_path = 'epadb_full_path_list'
epadb_root_path = 'EpaDB'

# Kaldi rspecifiers for the MFCC and i-vector archives under data_path.
# NOTE(review): data_path, conf_path and the model/graph path variables used
# below are not defined in the visible part of this snippet.
mfccs_rspec = ("ark:" + data_path + "/mfccs.ark")

ivectors_rspec = ("ark:" + data_path + "/ivectors.ark")

# Wspecifier for the log-likelihood archive.
loglikes_wspec = "ark:gop/loglikes.ark"

# Aligner built from the transition model, tree, language graph, symbol table
# and disambiguation symbols, with acoustic scale 1.0.
aligner = MappedAligner.from_files(transition_model_path,
                                   tree,
                                   lang_graph,
                                   symbols_path,
                                   disam,
                                   acoustic_scale=1.0)
# Rebinds `phones` from the value passed to read_text (presumably a path --
# confirm) to the loaded symbol table.
phones = SymbolTable.read_text(phones)
wb_info = WordBoundaryInfo.from_file(
    WordBoundaryInfoNewOpts(),
    "data/lang_test_tgsmall/phones/word_boundary.int")

# Instantiate the PyTorch acoustic model (subclass of torch.nn.Module)
# Weights are loaded from acoustic_model_path and the model is switched to
# evaluation mode.
model = FTDNN()
model.load_state_dict(torch.load(acoustic_model_path))
model.eval()

# Create feature manager
feature_manager = FeatureManager(epadb_root_path, data_path, conf_path)

# Opened for reading and writing, truncating any existing file.
# NOTE(review): opened without a context manager and no close() is visible
# in this view -- confirm it is closed later.
align_out_file = open("gop/align_output", "w+")
# Decode and write output lattices
with DoubleMatrixWriter(loglikes_wspec) as loglikes_writer:
Beispiel #11
0
    def __init__(
        self,
        cfg: KaldiDecoderConfig,
        beam: int,
        nbest: int = 1,
    ):
        """Configure a pykaldi decoder from ``cfg``.

        Selects the lattice or the plain faster decoder depending on
        ``cfg.is_lattice``, fills the decoder options from ``cfg``, loads
        the output symbols and the decoding graph, and creates the thread
        pool.

        Args:
            cfg: Decoder configuration. When ``cfg.hlg_graph_path`` is None
                the graph is built via ``cfg.kaldi_initializer_config`` and
                the resulting path is stored back on ``cfg``.
            beam: Decoding beam.
            nbest: Number of hypotheses; values above 1 require the lattice
                decoder.

        Raises:
            ImportError: If pykaldi is not installed.
            AssertionError: If no graph can be located, if ``nbest > 1`` is
                requested without the lattice decoder, or if a line of
                ``cfg.output_dict`` does not have exactly two fields.
        """
        try:
            from kaldi.asr import FasterRecognizer, LatticeFasterRecognizer
            from kaldi.base import set_verbose_level
            from kaldi.decoder import (
                FasterDecoder,
                FasterDecoderOptions,
                LatticeFasterDecoder,
                LatticeFasterDecoderOptions,
            )
            from kaldi.lat.functions import DeterminizeLatticePhonePrunedOptions
            from kaldi.fstext import read_fst_kaldi, SymbolTable
        except ImportError as err:
            # Nothing below works without pykaldi, so fail here with a
            # clear message rather than a later UnboundLocalError.
            raise ImportError(
                "pykaldi is required for this functionality. Please install from https://github.com/pykaldi/pykaldi"
            ) from err

        # set_verbose_level(2)

        self.acoustic_scale = cfg.acoustic_scale
        self.nbest = nbest

        if cfg.hlg_graph_path is None:
            assert (
                cfg.kaldi_initializer_config is not None
            ), "Must provide hlg graph path or kaldi initializer config"
            cfg.hlg_graph_path = initalize_kaldi(cfg.kaldi_initializer_config)

        assert os.path.exists(cfg.hlg_graph_path), cfg.hlg_graph_path

        if cfg.is_lattice:
            self.dec_cls = LatticeFasterDecoder
            opt_cls = LatticeFasterDecoderOptions
            self.rec_cls = LatticeFasterRecognizer
        else:
            assert self.nbest == 1, "nbest > 1 requires lattice decoder"
            self.dec_cls = FasterDecoder
            opt_cls = FasterDecoderOptions
            self.rec_cls = FasterRecognizer

        # Options shared by both decoder types.
        self.decoder_options = opt_cls()
        self.decoder_options.beam = beam
        self.decoder_options.max_active = cfg.max_active
        self.decoder_options.beam_delta = cfg.beam_delta
        self.decoder_options.hash_ratio = cfg.hash_ratio

        # Options that are only set for the lattice decoder.
        if cfg.is_lattice:
            self.decoder_options.lattice_beam = cfg.lattice_beam
            self.decoder_options.prune_interval = cfg.prune_interval
            self.decoder_options.determinize_lattice = cfg.determinize_lattice
            self.decoder_options.prune_scale = cfg.prune_scale
            det_opts = DeterminizeLatticePhonePrunedOptions()
            det_opts.max_mem = cfg.max_mem
            det_opts.phone_determinize = cfg.phone_determinize
            det_opts.word_determinize = cfg.word_determinize
            det_opts.minimize = cfg.minimize
            self.decoder_options.det_opts = det_opts

        # Each line of the output dictionary is "<symbol> <integer id>".
        self.output_symbols = {}
        with open(cfg.output_dict, "r") as f:
            for line in f:
                items = line.rstrip().split()
                assert len(items) == 2
                self.output_symbols[int(items[1])] = items[0]

        logger.info(f"Loading FST from {cfg.hlg_graph_path}")
        self.fst = read_fst_kaldi(cfg.hlg_graph_path)
        self.symbol_table = SymbolTable.read_text(cfg.output_dict)

        self.executor = ThreadPoolExecutor(max_workers=cfg.num_threads)
Beispiel #12
0
# Define the decodable wrapper: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Create a factory that turns features into a GMM decodable.

    Both models are captured by the returned closure, so callers only
    supply the feature matrix and the acoustic scale.
    """
    def decodable_wrapper(features, acoustic_scale):
        """Return the scaled diagonal-GMM decodable for ``features``."""
        model_args = (acoustic_model, trans_model)
        return DecodableAmDiagGmmScaled(*model_args, features, acoustic_scale)

    return decodable_wrapper


# Bind the models to the decodable factory.
# NOTE(review): trans_model, acoustic_model and feat_pipeline are not defined
# in the visible part of this snippet; presumably set up above -- confirm.
decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model)

# Define the decoder
# Faster decoder over the decoding graph, with beam 13 and at most 7000
# active states.
# NOTE(review): the model paths below are absolute paths into one user's home
# directory; adjust them before running elsewhere.
decoding_graph = read_fst_kaldi(
    "/home/dogan/tools/pykaldi/egs/models/wsj/HCLG.fst")
decoder_opts = FasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decoder = FasterDecoder(decoding_graph, decoder_opts)

# Define the recognizer
# The recognizer is given the decoder, the decodable factory and the word
# symbol table.
symbols = SymbolTable.read_text(
    "/home/dogan/tools/pykaldi/egs/models/wsj/words.txt")
asr = Recognizer(decoder, decodable_wrapper, symbols)

# Decode wave files
# For every wave in the scp list: compute features, decode them, and print
# the key with the recognized text (flushed after each utterance).
for key, wav in SequentialWaveReader(
        "scp:/home/dogan/tools/pykaldi/egs/decoder/test2.scp"):
    feats = feat_pipeline(wav)
    out = asr.decode(feats)
    print(key, out["text"], flush=True)