def __init__(self, idx_to_char, params=None):
    """Load the models and decoding resources named in ``params``.

    Args:
        idx_to_char: Index-to-character mapping. It is stored on the
            instance and handed to ``create_phone_map`` together with the
            phones file.
        params: Configuration dictionary. The keys ``'phones_path'``,
            ``'words_path'``, ``'mdl_path'`` and ``'fst_path'`` are
            required. ``'acoustic'`` (default 1.2), ``'beam'`` (default 13)
            and ``'alphaweight'`` (default 0.3) are optional.

    Raises:
        KeyError: If one of the required path keys is missing.
    """
    # Use a fresh dict per call instead of one shared mutable default.
    if params is None:
        params = {}

    self.idx_to_char = idx_to_char
    self.reorder_1, self.reorder_2 = create_phone_map(
        params['phones_path'], idx_to_char)
    self.word_syms = SymbolTable.read_text(params['words_path'])

    self.acoustic_scale = params.get('acoustic', 1.2)
    if self.acoustic_scale < 0:
        print("Warning: acoustic scale is less than 0")
    self.alphaweight = params.get('alphaweight', 0.3)

    # The transition model is read from the start of the .mdl stream.
    trans_model = TransitionModel()
    with xopen(params['mdl_path']) as ki:
        trans_model.read(ki.stream(), ki.binary)

    decoder_opts = FasterDecoderOptions()
    decoder_opts.beam = params.get('beam', 13)

    self.decoder_opts = decoder_opts
    self.trans_model = trans_model
    self.decode_fst = read_fst_kaldi(params['fst_path'])

    self.stats = LMStats()
    self.stats_state = None
    self.add_stats_phase = True
def read_trans_model(model_path: str) -> hmm.TransitionModel:
    """Load the transition model stored in the header of a .mdl file.

    Args:
        model_path: Location of the .mdl file.

    Returns:
        The transition model, i.e. the value returned by
        ``TransitionModel.read`` for the opened stream.
    """
    with xopen(model_path) as model_in:
        return hmm.TransitionModel().read(model_in.stream(), model_in.binary)
def read_sparse_mat(sparse_mat_dir: str) -> SparseMatrix:
    """Load a sparse matrix from a file.

    Args:
        sparse_mat_dir: Path to the sparse matrix file (despite the name,
            this is passed to ``xopen`` as a file to read).

    Returns:
        The ``SparseMatrix`` populated from the file.
    """
    matrix = SparseMatrix()
    with xopen(sparse_mat_dir, 'r') as source:
        matrix.read_(source.stream(), source.binary)
    return matrix
def from_kaldi(cls, filename, device):
    """Build a ``Gmm`` from a full-covariance GMM stored in a Kaldi file.

    Args:
        filename: Path handed to ``kio.xopen`` for reading the model.
        device: Forwarded unchanged to the ``Gmm`` constructor.

    Returns:
        A ``Gmm`` built from the means, covariances and weights that were
        read. Note that ``cls`` is not used; the result is always a ``Gmm``.
    """
    kaldi_gmm = KaldiFullGmm()
    with kio.xopen(filename) as model_in:
        kaldi_gmm.read(model_in.stream(), model_in.binary)

    means = torch.from_numpy(kaldi_gmm.get_means().numpy())
    weights = torch.from_numpy(kaldi_gmm.weights().numpy())

    dim = means.size(1)
    # The covariance buffer is always assembled on the CPU; only the final
    # Gmm receives the requested device.
    covariances = torch.zeros((weights.numel(), dim, dim),
                              device='cpu', dtype=torch.float32)
    for component, covar in enumerate(kaldi_gmm.get_covars()):
        covariances[component] = torch.from_numpy(KaldiMatrix(covar).numpy())

    return Gmm(means, covariances, weights, device=device)
def read_nnet3_model(model_path: str) -> nnet3.Nnet:
    """Load an nnet3 model stored in raw format.

    Per the original author's note, a model that is not in raw format may
    still load because of Kaldi internals, but that is not an official
    feature.

    Args:
        model_path: Path to a raw nnet3 model, e.g., "data/final.raw"

    Returns:
        The ``nnet3.Nnet`` populated from the file.
    """
    with xopen(model_path) as model_in:
        network = nnet3.Nnet()
        network.read(model_in.stream(), model_in.binary)
    return network
def get_diag_gmm_params(file_diag, out_dir):
    """Dump the variances, means and weights of a diagonal GMM as text.

    Three files named ``<basename>_variances.dubm``, ``<basename>_means.dubm``
    and ``<basename>_weights.dubm`` are written with ``np.savetxt``.

    Args:
        file_diag: Path to the model file (a .mdl or .ubm file according to
            the original comment).
        out_dir: Output prefix. It is concatenated directly with the file
            names, so it must end with a path separator when it names a
            directory.

    Returns:
        A tuple ``(variances, means, weights, num_gauss)``, or ``None`` when
        ``file_diag`` is not an existing file (a message is printed).
    """
    if not os.path.isfile(file_diag):
        print("File {0} does not exist!".format(file_diag))
        return None

    file_name = os.path.basename(file_diag)
    print("Getting GMM stats from", file_name)

    gmm = DiagGmm()
    # The context manager closes the input stream once the model is read.
    with io.xopen(file_diag) as diag_mdl:
        gmm.read(diag_mdl.stream(), diag_mdl.binary)

    variances = np.asanyarray(gmm.get_vars())
    means = np.asanyarray(gmm.get_means())
    weights = np.asanyarray(gmm.weights())  # priors

    np.savetxt(out_dir + '{}_variances.dubm'.format(file_name), variances)
    np.savetxt(out_dir + '{}_means.dubm'.format(file_name), means)
    np.savetxt(out_dir + '{}_weights.dubm'.format(file_name), weights)
    print("Vars, means and weights saved to:", out_dir)

    return variances, means, weights, gmm.num_gauss()
def get_utterance_pairs(reco2file_and_channel_rxfilename):
    """Group the utterances of a reco2file_and_channel file by call.

    Each line must have three whitespace-separated fields, e.g.
    ``sw02001-A sw02001 A``: utterance, call, and a third field that is
    ignored.

    Args:
        reco2file_and_channel_rxfilename: Extended filename of the
            reco2file_and_channel file, opened in text mode.

    Returns:
        A list of utterance lists. Calls with exactly two utterances yield
        one two-element list; every other call is reported on stderr and its
        utterances are emitted as single-element lists.

    Raises:
        ValueError: If a line does not have exactly three fields.
    """
    call_to_uttlist = defaultdict(list)
    # The context manager closes the stream even if a line is malformed.
    with xopen(reco2file_and_channel_rxfilename, "rt") as lines:
        for line in lines:
            fields = line.split()
            # Check the field count explicitly rather than catching every
            # exception, so unrelated errors are not misreported.
            if len(fields) != 3:
                filename = printable_rxfilename(
                    reco2file_and_channel_rxfilename)
                raise ValueError("Expecting 3 fields per line of "
                                 "reco2file_and_channel file {}, got: {}"
                                 .format(filename, len(fields)))
            utt, call, _ = fields  # lines like: sw02001-A sw02001 A
            call_to_uttlist[call].append(utt)

    utt_pairs = []
    for key, uttlist in call_to_uttlist.items():
        if len(uttlist) == 2:
            utt_pairs.append(uttlist)
        else:
            print("Call {} has {} utterances, expected two; treating them "
                  "singly.".format(key, len(uttlist)), file=sys.stderr)
            utt_pairs.extend([x] for x in uttlist)
    return utt_pairs
def gmm_decode_faster(model_rxfilename, fst_rxfilename, feature_rspecifier,
                      words_wspecifier, alignment_wspecifier="",
                      lattice_wspecifier="", word_symbol_table="",
                      acoustic_scale=0.1, allow_partial=True,
                      decoder_opts=None):
    """Decode features with a GMM acoustic model using ``FasterDecoder``.

    Args:
        model_rxfilename: Model file holding the transition model followed
            by the diagonal-GMM acoustic model.
        fst_rxfilename: Decoding graph read with ``read_fst_kaldi``.
        feature_rspecifier: Table read specifier for the feature matrices.
        words_wspecifier: Table write specifier for decoded word ids.
        alignment_wspecifier: Optional write specifier for alignments;
            written only if the writer is open.
        lattice_wspecifier: Optional write specifier for the best path as a
            compact lattice; written only if the writer is open.
        word_symbol_table: Optional symbol table file. When given, decoded
            words are also printed to stderr as symbols.
        acoustic_scale: Scale passed to ``DecodableAmDiagGmmScaled``.
        allow_partial: If true, output a partial traceback when the decoder
            does not reach a final state.
        decoder_opts: ``FasterDecoderOptions`` to use. A fresh default
            instance is created when omitted.

    Returns:
        ``True`` if at least one utterance was decoded successfully,
        ``False`` otherwise.

    Raises:
        RuntimeError: If the symbol table cannot be read.
    """
    # Build the default per call rather than once at definition time.
    if decoder_opts is None:
        decoder_opts = FasterDecoderOptions()

    # Read model.
    trans_model = TransitionModel()
    am_gmm = AmDiagGmm()
    with xopen(model_rxfilename) as ki:
        trans_model.read(ki.stream(), ki.binary)
        am_gmm.read(ki.stream(), ki.binary)

    # Open table readers/writers.
    feature_reader = SequentialMatrixReader(feature_rspecifier)
    words_writer = IntVectorWriter(words_wspecifier)
    alignment_writer = IntVectorWriter(alignment_wspecifier)
    clat_writer = CompactLatticeWriter(lattice_wspecifier)

    # Read symbol table.
    word_syms = None
    if word_symbol_table != "":
        word_syms = SymbolTable.read_text(word_symbol_table)
        if not word_syms:
            raise RuntimeError("Could not read symbol table from file {}"
                               .format(word_symbol_table))

    # NOTE:
    # It is important to read decode_fst after opening feature reader as
    # it can prevent crashes on systems without enough virtual memory.

    # Read decoding graph and instantiate decoder.
    decode_fst = read_fst_kaldi(fst_rxfilename)
    decoder = FasterDecoder(decode_fst, decoder_opts)

    tot_like = 0.0
    frame_count = 0
    num_success, num_fail = 0, 0
    start = time.time()

    for key, features in feature_reader:
        if features.num_rows == 0:
            num_fail += 1
            logging.warning("Zero-length utterance: {}".format(key))
            continue

        gmm_decodable = DecodableAmDiagGmmScaled(am_gmm, trans_model,
                                                 features, acoustic_scale)
        decoder.decode(gmm_decodable)

        if not (allow_partial or decoder.reached_final()):
            num_fail += 1
            logging.warning("Did not successfully decode utterance {}, len = {}"
                            .format(key, features.num_rows))
            continue

        try:
            best_path = decoder.get_best_path()
        except RuntimeError:
            num_fail += 1
            logging.warning("Did not successfully decode utterance {}, len = {}"
                            .format(key, features.num_rows))
            continue

        if not decoder.reached_final():
            logging.warning("Decoder did not reach end-state, outputting "
                            "partial traceback since --allow-partial=true")

        ali, words, weight = get_linear_symbol_sequence(best_path)

        words_writer[key] = words

        if alignment_writer.is_open():
            alignment_writer[key] = ali

        if clat_writer.is_open():
            if acoustic_scale != 0.0:
                # Undo the acoustic scaling before writing the lattice.
                scale = acoustic_lattice_scale(1.0 / acoustic_scale)
                scale_lattice(scale, best_path)
            best_path = convert_lattice_to_compact_lattice(best_path)
            clat_writer[key] = best_path

        if word_syms:
            syms = convert_indices_to_symbols(word_syms, words)
            print(key, " ".join(syms), file=sys.stderr)

        num_success += 1
        frame_count += features.num_rows
        like = -(weight.value1 + weight.value2)
        tot_like += like
        logging.info("Log-like per frame for utterance {} is {} over {} "
                     "frames.".format(key, like / features.num_rows,
                                      features.num_rows))
        logging.debug("Cost for utterance {} is {} + {}"
                      .format(key, weight.value1, weight.value2))

    elapsed = time.time() - start
    # Per-frame statistics are undefined when nothing was decoded; guarding
    # here avoids a ZeroDivisionError and lets the function return False.
    if frame_count > 0:
        logging.info("Time taken [excluding initialization] {}s: real-time "
                     "factor assuming 100 frames/sec is {}"
                     .format(elapsed, elapsed * 100 / frame_count))
    else:
        logging.warning("No frames were decoded; skipping per-frame "
                        "statistics.")
    logging.info("Done {} utterances, failed for {}"
                 .format(num_success, num_fail))
    if frame_count > 0:
        logging.info("Overall log-likelihood per frame is {} over {} frames."
                     .format(tot_like / frame_count, frame_count))

    feature_reader.close()
    words_writer.close()
    if alignment_writer.is_open():
        alignment_writer.close()
    if clat_writer.is_open():
        clat_writer.close()

    return num_success != 0
def _parse_segment_line(line):
    """Parse one segments-file line; log a warning and return None if invalid.

    Returns:
        ``(segment, recording, start, end, channel)`` where ``start`` and
        ``end`` are floats and ``channel`` is an int or ``None``.
    """
    # segments file format:
    #   segment-name wav-name start-time end-time [channel]
    fields = line.split()
    if len(fields) == 4:
        segment, recording, start, end = fields
        channel = None
    elif len(fields) == 5:
        segment, recording, start, end, channel = fields
    else:
        logging.warning("Invalid line in segments file: {}".format(line))
        return None

    try:
        start = float(start)
    except ValueError:
        logging.warning(
            "Invalid line in segments file [bad start]: {}".format(line))
        return None

    try:
        end = float(end)
    except ValueError:
        logging.warning(
            "Invalid line in segments file [bad end]: {}".format(line))
        return None

    # An end time of -1 means "until the end of the recording".
    if ((start < 0 or (end != -1.0 and end <= 0))
            or (start >= end and end > 0)):
        logging.warning("Invalid line in segments file [empty or "
                        "invalid segment]: {}".format(line))
        return None

    if channel:
        try:
            channel = int(channel)
        except ValueError:
            logging.warning("Invalid line in segments file "
                            "[bad channel]: {}".format(line))
            return None

    return segment, recording, start, end, channel


def extract_segments(wav_rspecifier, segments_rxfilename, wav_wspecifier, opts):
    """Cut segments out of recordings and write them as separate waves.

    Args:
        wav_rspecifier: Read specifier for the random-access wave table.
        segments_rxfilename: Segments file with lines of the form
            ``segment-name wav-name start-time end-time [channel]``.
        wav_wspecifier: Write specifier for the extracted segment waves.
        opts: Options object providing ``max_overshoot`` and
            ``min_segment_length`` (both multiplied by the sampling
            frequency to obtain sample counts).

    Raises:
        ValueError: If a recording has more than one channel and the
            segments file does not specify which channel to use.
    """
    with RandomAccessWaveReader(wav_rspecifier) as reader, \
            WaveWriter(wav_wspecifier) as writer, \
            xopen(segments_rxfilename, "rt") as segments_file:
        num_success, num_lines = 0, 0
        for num_lines, line in enumerate(segments_file, 1):
            parsed = _parse_segment_line(line)
            if parsed is None:
                continue
            segment, recording, start, end, channel = parsed

            if recording not in reader:
                logging.warning("Could not find recording {}, skipping "
                                "segment {}".format(recording, segment))
                continue

            wave = reader[recording]
            wave_data = wave.data()
            samp_freq = wave.samp_freq
            num_chan, num_samp = wave_data.shape

            # Convert starting time of the segment to corresponding sample
            # number. If end time is -1 then use the whole file starting
            # from start time.
            start_samp = start * samp_freq
            end_samp = end * samp_freq if end != -1 else num_samp
            assert start_samp >= 0 and end_samp > 0, "Invalid start or end."

            # start sample must be less than total number of samples,
            # otherwise skip the segment
            if start_samp < 0 or start_samp >= num_samp:
                logging.warning("Start sample out of range {} [length:] {}, "
                                "skipping segment {}".format(
                                    start_samp, num_samp, segment))
                continue

            # end sample must be less than total number samples
            # otherwise skip the segment
            if end_samp > num_samp:
                if end_samp >= num_samp + int(opts.max_overshoot * samp_freq):
                    logging.warning("End sample too far out of range {} "
                                    "[length:] {}, skipping segment {}".format(
                                        end_samp, num_samp, segment))
                    continue
                end_samp = num_samp  # for small differences, just truncate.

            # Skip if segment size is less than minimum segment length
            # (default 0.1s)
            min_samp = int(opts.min_segment_length * samp_freq)
            if end_samp <= start_samp + min_samp:
                logging.warning(
                    "Segment {} too short, skipping it!".format(segment))
                continue

            # check whether the wav file has more than one channel
            # if yes, specify the channel info in segments file
            # otherwise skips the segment
            if channel is None:
                if num_chan == 1:
                    channel = 0
                else:
                    raise ValueError(
                        "If your data has multiple channels, you "
                        "must specify the channel in the segments "
                        "file. Processing segment {}".format(segment))
            else:
                if channel >= num_chan:
                    logging.warning(
                        "Invalid channel {} >= {}, skipping segment"
                        " {}".format(channel, num_chan, segment))
                    continue

            segment_matrix = SubMatrix(wave_data, channel, 1,
                                       int(start_samp),
                                       int(end_samp - start_samp))
            segment_wave = WaveData.new(samp_freq, segment_matrix)
            writer[segment] = segment_wave  # write segment in wave format
            num_success += 1

        logging.info("Succesfully processed {} lines out of {} in the "
                     "segments file".format(num_success, num_lines))
vad_opts = VadEnergyOptions() vad_opts.vad_energy_threshold = 5.5 vad_opts.vad_energy_mean_scale = 0.5 delta_opts = DeltaFeaturesOptions() delta_opts.window = 3 delta_opts.order = 2 feat_pipeline = make_feat_pipeline(mfcc, sliding_opts, vad_opts, delta_opts) try: LOG.info('Loading ubm...') if not os.path.exists('app/extractor/final.ubm'): LOG.error('Not Found extractor/final.ubm, please recheck file') exit(1) with xopen('app/extractor/final.ubm') as ki: fgmm = FullGmm() fgmm.read(ki.stream(), ki.binary) gmm = DiagGmm() gmm.copy_from_full(fgmm) if not os.path.exists('app/extractor/final.ie'): LOG.error('Not Found app/extractor/final.ie, please recheck file') exit(1) with xopen('app/extractor/final.ie') as ki: extractor_ = IvectorExtractor() extractor_.read(ki.stream(), ki.binary) LOG.info('IvectorExtractor ready') except Exception:
cmvn.accumulate(feats) cmvn.apply(feats) return compute_deltas(opts, feats) return feat_pipeline mfcc_opts = MfccOptions() mfcc_opts.frame_opts.samp_freq = 44100 mfcc_opts.frame_opts.allow_downsample = True mfcc_opts.use_energy = False feat_pipeline = make_feat_pipeline(Mfcc(mfcc_opts)) # Read the model with xopen("models/mono/final.mdl") as ki: trans_model = TransitionModel().read(ki.stream(), ki.binary) acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary) # Define the decodable wrapper: (features, acoustic_scale) -> decodable def make_decodable_wrapper(trans_model, acoustic_model): def decodable_wrapper(features, acoustic_scale): return DecodableAmDiagGmmScaled(acoustic_model, trans_model, features, acoustic_scale) return decodable_wrapper decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model)
from kaldi.util.table import SequentialWaveReader


# Feature pipeline factory: (wav) -> feats
def make_feat_pipeline(base, opts=DeltaFeaturesOptions()):
    """Return a callable that turns a wave object into delta features.

    Args:
        base: Feature extractor exposing ``compute_features``.
        opts: Options forwarded to ``compute_deltas``.
    """
    def _pipeline(wav):
        # Only the first row of the wave data is used.
        first_channel = wav.data()[0]
        base_feats = base.compute_features(first_channel, wav.samp_freq, 1.0)
        return compute_deltas(opts, base_feats)

    return _pipeline


feat_pipeline = make_feat_pipeline(Mfcc(MfccOptions()))

# Load the transition model and then the acoustic model from the same
# model stream, in that order.
with xopen("/home/dogan/tools/pykaldi/egs/models/wsj/final.mdl") as ki:
    trans_model = TransitionModel().read(ki.stream(), ki.binary)
    acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary)


# Decodable wrapper factory: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Return a callable that builds a scaled GMM decodable for given features."""
    return lambda features, acoustic_scale: DecodableAmDiagGmmScaled(
        acoustic_model, trans_model, features, acoustic_scale)


decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model)