def compute_mfcc_feats(wav: WaveData, mfcc_opts: MfccOptions) -> Matrix:
    """Extract MFCC features from the first channel of a Kaldi WaveData.

    Args:
        wav: The WaveData object holding the audio.
        mfcc_opts: MfccOptions controlling the feature extraction. Options
            the original author singled out:

            - use_energy: usually False, since energy carries little
              linguistic information.
            - frame_opts.allow_downsample: usually True, since the acoustic
              model in use only handles the default sampling frequency
              (16 kHz).
            - frame_opts.frame_shift_ms: a smaller shift (e.g. 5 ms) may be
              preferable for speech synthesis.
            - frame_opts.snip_edges: usually False, so that the number of
              frames can be computed deterministically.

    Returns:
        A T*D MFCC feature matrix.
    """
    extractor = Mfcc(mfcc_opts)
    # Only the first channel is used.
    first_channel = wav.data()[0]
    # 1.0 is the default VTLN warp factor.
    return extractor.compute_features(first_channel, wav.samp_freq, 1.0)
def extract_mfcc(filename, samp_freq, frame_length_ms=25, frame_shift_ms=10,
                 num_ceps=23, round_to_power_of_two=True, snip_edges=True):
    """Extract MFCC features from a wav file using Kaldi.

    The function writes a one-entry ``wav.scp`` in the current directory,
    reads the wave through it, averages the channels, computes MFCCs and
    also writes them in text form to ``spec.ark``.

    Args:
        filename: Path of the wav file.
        samp_freq: Target sampling frequency. The file's own rate must be an
            integer multiple of it; the signal is decimated by plain
            striding (no low-pass filter is applied here).
        frame_length_ms: Analysis frame length in milliseconds.
        frame_shift_ms: Frame shift in milliseconds.
        num_ceps: Number of cepstral coefficients.
        round_to_power_of_two: Forwarded to the frame extraction options.
        snip_edges: Forwarded to the frame extraction options.

    Returns:
        The MFCC features of the last processed utterance as a numpy array
        of shape (frames, coefficients).

    Raises:
        AssertionError: if the file's sampling rate is lower than, or not a
            multiple of, ``samp_freq``.
        ValueError: if no utterance was processed (nothing was read, or the
            utterance was shorter than ``--min-duration``).
    """
    # Build the read/write specifiers around a throw-away scp file.
    with open('wav.scp', 'w') as scp_file:
        scp_file.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'

    # Command-line parsing is kept from the original: it supplies
    # --min-duration and reads the process arguments.
    usage = """Extract MFCC features.Usage: example.py [opts...] <rspec> <wspec>"""
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")
    opts = po.parse_args()

    # Feature options. The framing keyword arguments were previously
    # accepted but silently ignored; they are applied now. Their defaults
    # are unchanged, so default calls configure the same values as before.
    mfcc_opts = MfccOptions()
    mfcc_opts.frame_opts.samp_freq = samp_freq
    mfcc_opts.frame_opts.frame_length_ms = float(frame_length_ms)
    mfcc_opts.frame_opts.frame_shift_ms = float(frame_shift_ms)
    mfcc_opts.frame_opts.round_to_power_of_two = bool(round_to_power_of_two)
    mfcc_opts.frame_opts.snip_edges = bool(snip_edges)
    mfcc_opts.num_ceps = num_ceps
    # NOTE(review): registering after parse_args() means command-line values
    # cannot override these options; order kept as in the original.
    mfcc_opts.register(po)

    mfcc = Mfcc(mfcc_opts)
    sf = mfcc_opts.frame_opts.samp_freq

    f_array = None
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue

            assert wav.samp_freq >= sf, "wave sampling rate is below samp_freq"
            assert wav.samp_freq % sf == 0, \
                "wave sampling rate is not a multiple of samp_freq"

            # Decimate by striding, then average the channels.
            samples = wav.data()
            samples = samples[:, ::int(wav.samp_freq / sf)]
            mono = SubVector(mean(samples, axis=0))

            feats = mfcc.compute_features(mono, sf, 1.0)
            f_array = np.array(feats)
            print(f_array.shape)
            writer[key] = feats

    if f_array is None:
        # Previously this surfaced as an UnboundLocalError on the return.
        raise ValueError(
            "no features were extracted from {!r}".format(filename))
    return f_array
def lid_module(key, audio_file, start, end):
    """Predict a label for one segment of an audio file.

    The segment is cut out and converted to mono 8 kHz wav by a sox pipe,
    hi-res MFCCs are computed with the module-level ``hires_mfcc_opts``, the
    module-level ``CMVN`` is subtracted, and the result is passed to
    ``network_eval.predict``. The index of the highest score is looked up in
    ``i2l``. (Presumably language identification, going by the name.)

    Args:
        key: Utterance id used in the generated scp entry (must be a str).
        audio_file: Path of the source wav file (must be a str).
        start: Segment start passed to sox ``trim``.
        end: Segment end; the duration given to sox is
            ``float(end) - float(start)``.

    Returns:
        ``i2l[v.argmax()]`` for the network output ``v``.
    """
    # SECURITY NOTE(review): key and audio_file are interpolated unquoted
    # into a command line that is executed by a shell; only pass trusted
    # values. Left unchanged to preserve the generated specifier exactly.
    duration = str(float(end) - float(start))
    wav_spc = ("scp:echo " + key + " 'sox -V0 -t wav " + audio_file
               + " -c 1 -r 8000 -t wav - trim " + str(start) + " "
               + duration + "|' |")

    hires_mfcc = Mfcc(hires_mfcc_opts)

    # Scope the reader so the underlying pipe is closed deterministically;
    # features are computed while the reader still owns the wave.
    with SequentialWaveReader(wav_spc) as reader:
        wav = reader.value()
        hi_feat = hires_mfcc.compute_features(wav.data()[0],
                                              wav.samp_freq, 1.0)

    hi_feat = hi_feat.numpy() - CMVN

    # Transpose, then add a leading and a trailing singleton axis.
    X = hi_feat.T
    X = np.expand_dims(np.expand_dims(X, 0), -1)

    v = network_eval.predict(X)
    return i2l[v.argmax()]
def lid_module(key, audio_file, start, end):
    """Compute and print an embedding for one segment of an audio file.

    The segment is cut out and converted to mono 16 kHz wav by a sox pipe,
    hi-res MFCCs are computed with the module-level ``hires_mfcc_opts``, and
    the module-level ``CMVN`` is subtracted. The transposed features are
    truncated or zero-padded to 384 columns and passed to
    ``nn_LID_model_DA.emb``.

    Args:
        key: Utterance id used in the generated scp entry (must be a str).
        audio_file: Path of the source wav file (must be a str).
        start: Segment start passed to sox ``trim``.
        end: Segment end; the duration given to sox is
            ``float(end) - float(start)``.

    Returns:
        None. NOTE(review): the embedding is only printed, never returned;
        confirm whether the function is truncated.
    """
    # SECURITY NOTE(review): key and audio_file are interpolated unquoted
    # into a command line that is executed by a shell; only pass trusted
    # values. Left unchanged to preserve the generated specifier exactly.
    duration = str(float(end) - float(start))
    wav_spc = ("scp:echo " + key + " 'sox -V0 -t wav " + audio_file
               + " -c 1 -r 16000 -t wav - trim " + str(start) + " "
               + duration + "|' |")

    hires_mfcc = Mfcc(hires_mfcc_opts)

    # Scope the reader so the underlying pipe is closed deterministically;
    # features are computed while the reader still owns the wave.
    with SequentialWaveReader(wav_spc) as reader:
        wav = reader.value()
        hi_feat = hires_mfcc.compute_features(wav.data()[0],
                                              wav.samp_freq, 1.0)

    hi_feat = hi_feat.numpy() - CMVN
    X = hi_feat.T
    print(X.shape)

    if X.shape[1] >= 384:
        # Long enough: keep the first 384 columns.
        X = np.expand_dims(X[:, :384], 0)
    else:
        # Too short: zero-pad on the right up to 384 columns.
        # NOTE(review): the 40 rows are hard-coded; this assumes the feature
        # dimension is 40 -- confirm against hires_mfcc_opts.
        # NOTE(review): this branch yields float32 data while the branch
        # above keeps X's own dtype -- confirm the model accepts both.
        padded_x = torch.zeros(40, 384)
        padded_x[:, :X.shape[1]] = torch.from_numpy(X)
        X = np.expand_dims(padded_x, 0)
    print(X.shape)

    emb = nn_LID_model_DA.emb(torch.from_numpy(X))[0]
    print(emb.shape)
def compute_feat_KALDI(self, wav):
    """Compute Kaldi MFCC features for ``wav`` from this object's settings.

    The options are filled from ``self.sr``, ``self.frame_length_s``,
    ``self.frame_shift_s``, ``self.num_bins``, ``self.low_freq``,
    ``self.high_freq`` and ``self.num_ceps``. Energy is not used and
    downsampling is not allowed.

    Args:
        wav: The signal handed to ``Mfcc.compute_features``.

    Returns:
        The features returned by ``Mfcc.compute_features``.

    Raises:
        ValueError: if any step fails. The underlying exception is logged
            through ``self.log.error`` first.
    """
    try:
        parser = ParseOptions("")

        options = MfccOptions()
        options.use_energy = False

        # Framing: the instance stores lengths in seconds, Kaldi wants ms.
        framing = options.frame_opts
        framing.samp_freq = self.sr
        framing.frame_length_ms = self.frame_length_s * 1000
        framing.frame_shift_ms = self.frame_shift_s * 1000
        framing.allow_downsample = False

        # Mel filterbank layout.
        mel = options.mel_opts
        mel.num_bins = self.num_bins
        mel.low_freq = self.low_freq
        mel.high_freq = self.high_freq

        options.num_ceps = self.num_ceps
        options.register(parser)

        # Build the extractor and run it with the default warp factor.
        return Mfcc(options).compute_features(wav, self.sr, 1.0)
    except Exception as err:
        self.log.error(err)
        raise ValueError(
            "Speaker diarization failed while extracting features!!!")
# NOTE(review): the two prints below use `key` and `out` bound by code that
# precedes this excerpt; they are reproduced unchanged.
print(key, out["text"], flush=True)
print("-" * 80, flush=True)


# Define feature pipeline in code
def make_feat_pipeline(base, opts=None):
    """Build a ``wav -> features`` callable around a base feature extractor.

    The returned callable computes base features on the first channel,
    applies CMVN with statistics accumulated from those same features, and
    appends delta features.

    Args:
        base: Feature extractor providing ``compute_features`` and ``dim``.
        opts: Options for ``compute_deltas``. When omitted, a fresh
            ``DeltaFeaturesOptions()`` is created per call, so one shared
            default instance is no longer built at definition time.

    Returns:
        A function taking a wave object and returning the features.
    """
    if opts is None:
        opts = DeltaFeaturesOptions()

    def feat_pipeline(wav):
        feats = base.compute_features(wav.data()[0], wav.samp_freq, 1.0)
        # CMVN statistics come from this utterance's own features.
        cmvn = Cmvn(base.dim())
        cmvn.accumulate(feats)
        cmvn.apply(feats)
        return compute_deltas(opts, feats)

    return feat_pipeline


frame_opts = FrameExtractionOptions()
frame_opts.samp_freq = 16000
frame_opts.allow_downsample = True
mfcc_opts = MfccOptions()
mfcc_opts.use_energy = False
mfcc_opts.frame_opts = frame_opts
feat_pipeline = make_feat_pipeline(Mfcc(mfcc_opts))

# Decode. The reader is scoped so it is closed even if decoding raises.
with SequentialWaveReader("scp:wav.scp") as reader:
    for key, wav in reader:
        feats = feat_pipeline(wav)
        out = asr.decode(feats)
        print(key, out["text"], flush=True)
def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Compute MFCC features for every wave in a table and write them out.

    Args:
        wav_rspecifier: Read specifier of the input wave table.
        feats_wspecifier: Write specifier of the output feature table.
        opts: Parsed options. The fields read here are ``vtln_map``,
            ``utt2spk``, ``vtln_warp``, ``min_duration``, ``channel``
            (``-1`` selects channel 0) and ``subtract_mean``.
        mfcc_opts: Options used to construct the ``Mfcc`` extractor.

    Returns:
        True if features were written for at least one utterance.

    Utterances that are too short, lack the requested channel, have no
    vtln-map entry, or fail during extraction are reported on stderr and
    skipped.
    """
    mfcc = Mfcc(mfcc_opts)

    vtln_map_reader = None
    if opts.vtln_map:
        vtln_map_reader = RandomAccessFloatReaderMapped(
            opts.vtln_map, opts.utt2spk)
    elif opts.utt2spk:
        print("utt2spk option is needed only if vtln-map option is specified.",
              file=sys.stderr)

    num_utts, num_success = 0, 0
    try:
        with SequentialWaveReader(wav_rspecifier) as reader, \
                MatrixWriter(feats_wspecifier) as writer:
            for num_utts, (key, wave) in enumerate(reader, 1):
                if wave.duration < opts.min_duration:
                    print("File: {} is too short ({} sec): producing no output."
                          .format(key, wave.duration), file=sys.stderr)
                    continue

                num_chan = wave.data().num_rows
                if opts.channel >= num_chan:
                    # The placeholders were previously printed literally
                    # because .format() was never called.
                    print("File with id {} has {} channels but you specified "
                          "channel {}, producing no output."
                          .format(key, num_chan, opts.channel),
                          file=sys.stderr)
                    continue
                channel = 0 if opts.channel == -1 else opts.channel

                if vtln_map_reader is not None:
                    if key not in vtln_map_reader:
                        print("No vtln-map entry for utterance-id "
                              "(or speaker-id)", key, file=sys.stderr)
                        continue
                    vtln_warp = vtln_map_reader[key]
                else:
                    vtln_warp = opts.vtln_warp

                try:
                    feats = mfcc.compute_features(wave.data()[channel],
                                                  wave.samp_freq, vtln_warp)
                except Exception:
                    # Deliberately best-effort per utterance, but no longer
                    # swallows KeyboardInterrupt / SystemExit.
                    print("Failed to compute features for utterance", key,
                          file=sys.stderr)
                    continue

                if opts.subtract_mean:
                    # Per-utterance mean over frames, removed from each row.
                    col_mean = Vector(feats.num_cols)
                    col_mean.add_row_sum_mat_(1.0, feats)
                    col_mean.scale_(1.0 / feats.num_rows)
                    for i in range(feats.num_rows):
                        feats[i].add_vec_(-1.0, col_mean)

                writer[key] = feats
                num_success += 1

                if num_utts % 10 == 0:
                    print("Processed {} utterances".format(num_utts),
                          file=sys.stderr)

        print("Done {} out of {} utterances".format(num_success, num_utts),
              file=sys.stderr)
    finally:
        # Close the vtln map even when reading or writing raises.
        if vtln_map_reader is not None:
            vtln_map_reader.close()

    return num_success != 0
voice_feats.row(index).copy_row_from_mat_(feats, i) index += 1 LOG.debug('Feats extract successed') return voice_feats return feat_pipeline mfcc_opts = MfccOptions() mfcc_opts.frame_opts.samp_freq = 16000 mfcc_opts.frame_opts.allow_downsample = True mfcc_opts.mel_opts.num_bins = 40 mfcc_opts.num_ceps = 20 mfcc_opts.use_energy = True mfcc = Mfcc(mfcc_opts) sliding_opts = SlidingWindowCmnOptions() sliding_opts.cmn_window = 300 sliding_opts.normalize_variance = False sliding_opts.center = True vad_opts = VadEnergyOptions() vad_opts.vad_energy_threshold = 5.5 vad_opts.vad_energy_mean_scale = 0.5 delta_opts = DeltaFeaturesOptions() delta_opts.window = 3 delta_opts.order = 2 feat_pipeline = make_feat_pipeline(mfcc, sliding_opts, vad_opts, delta_opts)
def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Extract signal windows for every wave in a table and write them out.

    Despite the name, the features written are the output of
    ``extract_windows`` on the peak-normalised signal, not MFCCs.

    Args:
        wav_rspecifier: Read specifier of the input wave table.
        feats_wspecifier: Write specifier of the output feature table.
        opts: Parsed options. The fields read here are ``sampling_rate``,
            ``label_window_length``, ``label_window_shift`` and
            ``signal_window_length`` (divided by 1000 after multiplying by
            the sampling rate, so presumably milliseconds), plus
            ``min_duration``, ``channel`` (``-1`` selects channel 0) and
            ``subtract_mean``.
        mfcc_opts: Options used to construct an ``Mfcc`` object.

    Returns:
        True if features were written for at least one utterance.

    Utterances that are too short, lack the requested channel, or fail
    during extraction are reported on stderr and skipped.
    """
    # NOTE(review): this extractor is never used below. It is kept in case
    # callers rely on mfcc_opts being checked at construction time; confirm
    # and remove.
    mfcc = Mfcc(mfcc_opts)

    # Window sizes in samples.
    # NOTE(review): these use opts.sampling_rate, not wave.samp_freq; the two
    # are assumed to agree -- confirm.
    lab_window_len_sample = int(
        (opts.sampling_rate * opts.label_window_length) / 1000)
    lab_window_shift_sample = int(
        (opts.sampling_rate * opts.label_window_shift) / 1000)
    sig_window_len_sample = int(
        (opts.sampling_rate * opts.signal_window_length) / 1000)

    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
            MatrixWriter(feats_wspecifier) as writer:
        for num_utts, (key, wave) in enumerate(reader, 1):
            if wave.duration < opts.min_duration:
                print("File: {} is too short ({} sec): producing no output."
                      .format(key, wave.duration), file=sys.stderr)
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # The placeholders were previously printed literally because
                # .format() was never called.
                print("File with id {} has {} channels but you specified "
                      "channel {}, producing no output."
                      .format(key, num_chan, opts.channel), file=sys.stderr)
                continue
            channel = 0 if opts.channel == -1 else opts.channel

            try:
                # Scale by 2**15 (32768), then normalise to unit peak.
                signal = wave.data()[channel].numpy()
                signal = signal.astype(float) / 2**15
                peak = np.max(np.abs(signal))
                if peak > 0:
                    # An all-zero signal is left as zeros, not turned into
                    # NaNs by a division by zero.
                    signal /= peak

                # Extract windows
                feats = extract_windows(signal, sig_window_len_sample,
                                        lab_window_len_sample,
                                        lab_window_shift_sample)
            except Exception:
                # Deliberately best-effort per utterance, but no longer
                # swallows KeyboardInterrupt / SystemExit.
                print("Failed to compute features for utterance", key,
                      file=sys.stderr)
                continue

            if opts.subtract_mean:
                # Per-utterance mean over rows, removed from each row.
                col_mean = Vector(feats.num_cols)
                col_mean.add_row_sum_mat_(1.0, feats)
                col_mean.scale_(1.0 / feats.num_rows)
                for i in range(feats.num_rows):
                    feats[i].add_vec_(-1.0, col_mean)

            writer[key] = feats
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)
    return num_success != 0
from kaldi.gmm.am import AmDiagGmm, DecodableAmDiagGmmScaled
from kaldi.hmm import TransitionModel
from kaldi.util.io import xopen
from kaldi.util.table import SequentialWaveReader


# Define the feature pipeline: (wav) -> feats
def make_feat_pipeline(base, opts=None):
    """Build a ``wav -> features`` callable around a base feature extractor.

    The returned callable computes base features on the first channel and
    appends delta features.

    Args:
        base: Feature extractor providing ``compute_features``.
        opts: Options for ``compute_deltas``. When omitted, a fresh
            ``DeltaFeaturesOptions()`` is created per call, so one shared
            default instance is no longer built at definition time.

    Returns:
        A function taking a wave object and returning the features.
    """
    if opts is None:
        opts = DeltaFeaturesOptions()

    def feat_pipeline(wav):
        feats = base.compute_features(wav.data()[0], wav.samp_freq, 1.0)
        return compute_deltas(opts, feats)

    return feat_pipeline


feat_pipeline = make_feat_pipeline(Mfcc(MfccOptions()))

# Read the model: the transition model first, then the acoustic model, both
# from the same stream.
# NOTE(review): the model path is a hard-coded absolute path.
with xopen("/home/dogan/tools/pykaldi/egs/models/wsj/final.mdl") as ki:
    trans_model = TransitionModel().read(ki.stream(), ki.binary)
    acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary)


# Define the decodable wrapper: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Return a function building a scaled GMM decodable from features.

    Args:
        trans_model: Transition model passed to the decodable.
        acoustic_model: Acoustic model passed to the decodable.

    Returns:
        A function ``(features, acoustic_scale) -> DecodableAmDiagGmmScaled``.
    """
    def decodable_wrapper(features, acoustic_scale):
        return DecodableAmDiagGmmScaled(acoustic_model, trans_model,
                                        features, acoustic_scale)

    return decodable_wrapper