def LoadModels(self):
    """Load the online decoding configuration, the models and the recognizer.

    Reads ``online.conf`` from ``self.CONFIG_FILES_PATH``, ``final.mdl`` from
    ``self.AM_PATH``, and ``HCLG.fst``, ``words.txt`` and
    ``word_boundary.int`` from ``self.LM_PATH``.

    On success the following attributes are set on ``self``:
    ``endpoint_opts``, ``decodable_opts``, ``feat_info``, ``samp_freq``,
    ``frame_shift``, ``acwt``, ``transition_model``, ``acoustic_model``,
    ``decoder_graph``, ``symbols``, ``info`` and ``asr``.

    Raises:
        ValueError: If any step fails. The underlying exception is logged
            with ``self.log.error`` and attached as ``__cause__``.
    """
    try:
        # Define online feature pipeline
        po = ParseOptions("")
        decoder_opts = LatticeFasterDecoderOptions()
        self.endpoint_opts = OnlineEndpointConfig()
        self.decodable_opts = NnetSimpleLoopedComputationOptions()
        feat_opts = OnlineNnetFeaturePipelineConfig()

        # Every option group is registered on the parser before the config
        # file is read.
        decoder_opts.register(po)
        self.endpoint_opts.register(po)
        self.decodable_opts.register(po)
        feat_opts.register(po)
        po.read_config_file(self.CONFIG_FILES_PATH + "/online.conf")
        self.feat_info = OnlineNnetFeaturePipelineInfo.from_config(
            feat_opts)

        # Set metadata parameters
        frame_opts = self.feat_info.mfcc_opts.frame_opts
        self.samp_freq = frame_opts.samp_freq
        # The option is named frame_shift_ms; the stored value is that
        # number divided by 1000.
        self.frame_shift = frame_opts.frame_shift_ms / 1000
        self.acwt = self.decodable_opts.acoustic_scale

        # Load Acoustic and graph models and other files
        self.transition_model, self.acoustic_model = \
            NnetRecognizer.read_model(self.AM_PATH + "/final.mdl")
        graph = _fst.read_fst_kaldi(self.LM_PATH + "/HCLG.fst")
        self.decoder_graph = LatticeFasterOnlineDecoder(
            graph, decoder_opts)
        self.symbols = _fst.SymbolTable.read_text(
            self.LM_PATH + "/words.txt")
        self.info = WordBoundaryInfo.from_file(
            WordBoundaryInfoNewOpts(),
            self.LM_PATH + "/word_boundary.int")
        self.asr = NnetLatticeFasterOnlineRecognizer(
            self.transition_model,
            self.acoustic_model,
            self.decoder_graph,
            self.symbols,
            decodable_opts=self.decodable_opts,
            endpoint_opts=self.endpoint_opts)
    except Exception as e:
        self.log.error(e)
        # Chain the original exception so the root cause stays attached to
        # the ValueError that callers see.
        raise ValueError(
            "AM and LM loading failed!!! (see logs for more details)") from e
from kaldi.nnet3 import NnetSimpleComputationOptions from kaldi.util.table import SequentialMatrixReader # Construct aligner decodable_opts = NnetSimpleComputationOptions() decodable_opts.acoustic_scale = 1.0 decodable_opts.frame_subsampling_factor = 3 decodable_opts.frames_per_chunk = 150 aligner = NnetAligner.from_files("exp/tdnn_7b_chain_online/final.mdl", "exp/tdnn_7b_chain_online/tree", "exp/langdir/L.fst", "exp/langdir/words.txt", "exp/langdir/phones/disambig.int", decodable_opts=decodable_opts) phones = SymbolTable.read_text("exp/langdir/phones.txt") wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(), "exp/langdir/phones/word_boundary.int") # Define feature pipelines as Kaldi rspecifiers feats_rspec = ( "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |") ivectors_rspec = ( "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |" "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |" ) # Align wav files with SequentialMatrixReader(feats_rspec) as f, \ SequentialMatrixReader(ivectors_rspec) as i, \ open("data/text") as t, \ open("out/align.out", "w") as a, \ open("out/phone_align.out", "w") as p, \
from kaldi.nnet3 import NnetSimpleComputationOptions from kaldi.util.table import SequentialMatrixReader # Construct aligner decodable_opts = NnetSimpleComputationOptions() decodable_opts.acoustic_scale = 1.0 decodable_opts.frame_subsampling_factor = 3 decodable_opts.frames_per_chunk = 150 aligner = NnetAligner.from_files("exp/tdnn_7b_chain_online/final.mdl", "exp/tdnn_7b_chain_online/tree", "data/lang/L.fst", "data/lang/words.txt", "data/lang/phones/disambig.int", decodable_opts=decodable_opts) phones = SymbolTable.read_text("data/lang/phones.txt") wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(), "data/lang/phones/word_boundary.int") # Define feature pipelines as Kaldi rspecifiers feats_rspec = ( "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |" ) ivectors_rspec = ( "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |" "ivector-extract-online2 --config=conf/ivector_extractor.conf ark:data/test/spk2utt ark:- ark:- |" ) # Align wav files with SequentialMatrixReader(feats_rspec) as f, \ SequentialMatrixReader(ivectors_rspec) as i, \ open("data/test/text") as t, \ open("out/test/align.out", "w") as a, \
import os

from kaldi.alignment import NnetAligner
from kaldi.fstext import SymbolTable
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct aligner
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
aligner = NnetAligner.from_files("final.mdl", "tree", "L.fst", "words.txt",
                                 "disambig.int",
                                 decodable_opts=decodable_opts)
phones = SymbolTable.read_text("phones.txt")
wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                     "word_boundary.int")

# Define feature pipelines as Kaldi rspecifiers
feats_rspec = "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- |"
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:-"
    " | ivector-extract-online2 --config=ivector.conf ark:spk2utt ark:- ark:- |"
)

# Align wav files
with SequentialMatrixReader(feats_rspec) as f, \
        SequentialMatrixReader(ivectors_rspec) as i, \
        open("text") as t:
    # The three streams are consumed in lockstep: one feature matrix, one
    # i-vector matrix and one transcript line per iteration.
    for (fkey, feats), (ikey, ivectors), line in zip(f, i, t):
        # Each transcript line is "<key> <text>"; split off the key only.
        tkey, text = line.strip().split(None, 1)
        # Explicit check instead of `assert`, which is removed under
        # `python -O` and would let mismatched entries be aligned silently.
        if not (fkey == ikey == tkey):
            raise ValueError(
                "utterance keys out of sync: feats=%r ivectors=%r text=%r"
                % (fkey, ikey, tkey))
        out = aligner.align((feats, ivectors), text)