Esempio n. 1
0
def compute_mfcc_feats(wav: WaveData, mfcc_opts: MfccOptions) -> Matrix:
    """Extract MFCC features from the first channel of a Kaldi WaveData.

    Args:
        wav: The WaveData object to process; only channel 0 is read.
        mfcc_opts: An MfccOptions object with the feature extraction
            options. Options the original author singled out:
            - use_energy: usually False, since the energy does not carry
              much linguistic information.
            - frame_opts.allow_downsample: usually True, because the
              acoustic model in use only handles the default sampling
              frequency (16 kHz).
            - frame_opts.frame_shift_ms: for speech synthesis a smaller
              shift (e.g. 5 ms) might be preferable.
            - frame_opts.snip_edges: usually False, to have a
              deterministic way to compute the number of frames.

    Returns:
        A T*D MFCC feature matrix.
    """
    extractor = Mfcc(mfcc_opts)
    first_channel = wav.data()[0]
    # The VTLN warp factor is fixed to 1.0, which is the default value.
    return extractor.compute_features(first_channel, wav.samp_freq, 1.0)
Esempio n. 2
0
def extract_mfcc(filename,
                 samp_freq,
                 frame_length_ms=25,
                 frame_shift_ms=10,
                 num_ceps=23,
                 round_to_power_of_two=True,
                 snip_edges=True):
    """Extract MFCC features from a wav file using Kaldi.

    A one-line script file ``wav.scp`` is written to the current working
    directory, read back through ``scp,p:wav.scp`` and the resulting
    features are written through ``ark,t:spec.ark``.

    Args:
        filename: Path of the wav file.
        samp_freq: Target sample frequency. The input's sample frequency
            must be an integer multiple of it; the signal is decimated by
            that factor before feature extraction.
        frame_length_ms: Frame length in milliseconds.
        frame_shift_ms: Frame shift in milliseconds.
        num_ceps: Number of cepstral coefficients.
        round_to_power_of_two: Forwarded to ``frame_opts.round_to_power_of_two``.
        snip_edges: Forwarded to ``frame_opts.snip_edges``.

    Returns:
        The features of the last processed utterance as a numpy array of
        shape (frames, coefficients), or None when no utterance was
        processed (nothing read, or all shorter than ``min-duration``).
    """
    # Build the read/write specifiers around a throw-away script file.
    with open('wav.scp', 'w') as scp_file:
        scp_file.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'

    # NOTE(review): parse_args() is called without arguments, so it
    # presumably reads the running process's command line -- confirm this
    # is wanted when the function is used as a library call.
    usage = """Extract MFCC features.Usage: example.py [opts...] <rspec> <wspec>"""
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")
    opts = po.parse_args()

    # Apply every framing parameter of the signature; previously only
    # samp_freq and num_ceps were honoured and the rest silently ignored.
    mfcc_opts = MfccOptions()
    mfcc_opts.frame_opts.samp_freq = samp_freq
    mfcc_opts.frame_opts.frame_length_ms = float(frame_length_ms)
    mfcc_opts.frame_opts.frame_shift_ms = float(frame_shift_ms)
    mfcc_opts.frame_opts.round_to_power_of_two = round_to_power_of_two
    mfcc_opts.frame_opts.snip_edges = snip_edges
    mfcc_opts.num_ceps = num_ceps
    mfcc_opts.register(po)
    mfcc = Mfcc(mfcc_opts)
    sf = mfcc_opts.frame_opts.samp_freq

    # Stays None if the loop never produces features, instead of leaving
    # the name unbound at the return statement.
    f_array = None
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            assert (wav.samp_freq >= sf)
            assert (wav.samp_freq % sf == 0)
            # Decimate by the integer rate ratio, then average the channels.
            samples = wav.data()
            samples = samples[:, ::int(wav.samp_freq / sf)]
            mono = SubVector(mean(samples, axis=0))
            feats = mfcc.compute_features(mono, sf, 1.0)
            f_array = np.array(feats)
            print(f_array.shape)
            writer[key] = feats
    return f_array
Esempio n. 3
0
def lid_module(key, audio_file, start, end):
    """Classify one segment of an audio file and return its label.

    The segment is cut and resampled to mono 8000 Hz by a ``sox`` command
    embedded in the Kaldi read specifier, converted to MFCC features with
    the module-level ``hires_mfcc_opts``, offset by the module-level
    ``CMVN`` and passed to ``network_eval.predict``.

    Args:
        key: Utterance id placed in front of the piped command.
        audio_file: Path of the wav file handed to ``sox``.
        start: Segment start passed to ``sox trim``.
        end: Segment end; the trim duration is ``float(end) - float(start)``.

    Returns:
        The entry of ``i2l`` at the arg-max index of the network output.
    """
    # NOTE: key and audio_file are interpolated unescaped into a shell
    # command line; callers must only pass trusted values.
    wav_spc = "scp:echo " + key + " 'sox -V0 -t wav " + audio_file + " -c 1 -r 8000 -t wav - trim " + str(start) + " " + str(
        float(end) - float(start)) + "|' |"
    hires_mfcc = Mfcc(hires_mfcc_opts)
    # Use the reader as a context manager so the table (and the piped
    # command behind it) is closed once the features are computed.
    with SequentialWaveReader(wav_spc) as reader:
        wav = reader.value()
        hi_feat = hires_mfcc.compute_features(wav.data()[0], wav.samp_freq,
                                              1.0)
    hi_feat = hi_feat.numpy() - CMVN
    # Transpose, then add a leading and a trailing singleton axis.
    X = hi_feat.T
    X = np.expand_dims(np.expand_dims(X, 0), -1)
    v = network_eval.predict(X)
    return i2l[v.argmax()]
Esempio n. 4
0
def lid_module(key, audio_file, start, end):
    """Compute an embedding for one segment of an audio file.

    The segment is cut and resampled to mono 16000 Hz by a ``sox`` command
    embedded in the Kaldi read specifier, converted to MFCC features with
    the module-level ``hires_mfcc_opts``, offset by the module-level
    ``CMVN``, cropped or zero-padded to 384 columns and passed to
    ``nn_LID_model_DA.emb``.

    Args:
        key: Utterance id placed in front of the piped command.
        audio_file: Path of the wav file handed to ``sox``.
        start: Segment start passed to ``sox trim``.
        end: Segment end; the trim duration is ``float(end) - float(start)``.

    Note:
        As written in this excerpt the function only prints the
        intermediate shapes and does not return the embedding.
    """
    # NOTE: key and audio_file are interpolated unescaped into a shell
    # command line; callers must only pass trusted values.
    wav_spc = "scp:echo " + key + " 'sox -V0 -t wav " + audio_file + " -c 1 -r 16000 -t wav - trim " + str(
        start) + " " + str(
        float(end) - float(start)) + "|' |"
    hires_mfcc = Mfcc(hires_mfcc_opts)
    # Use the reader as a context manager so the table (and the piped
    # command behind it) is closed once the features are computed.
    with SequentialWaveReader(wav_spc) as reader:
        wav = reader.value()
        hi_feat = hires_mfcc.compute_features(wav.data()[0], wav.samp_freq,
                                              1.0)
    hi_feat = hi_feat.numpy() - CMVN
    X = hi_feat.T
    print(X.shape)
    if X.shape[1] >= 384:
        # Long enough: keep the first 384 columns.
        X = np.expand_dims(X[:, :384], 0)
    else:
        # Too short: zero-pad on the right up to 384 columns.
        # NOTE(review): the 40 rows are hard-coded; presumably the feature
        # dimension configured in hires_mfcc_opts -- confirm.
        padded_x = torch.zeros(40, 384)
        padded_x[:, :X.shape[1]] = torch.from_numpy(X)
        X = np.expand_dims(padded_x, 0)
    print(X.shape)
    emb = nn_LID_model_DA.emb(torch.from_numpy(X))[0]
    print(emb.shape)
    def compute_feat_KALDI(self, wav):
        """Compute Kaldi MFCC features for ``wav`` from this object's settings.

        The MFCC options are built from ``self.sr``, ``self.frame_length_s``,
        ``self.frame_shift_s`` (both converted to milliseconds),
        ``self.num_bins``, ``self.low_freq``, ``self.high_freq`` and
        ``self.num_ceps``; energy and downsampling are disabled.

        Args:
            wav: The signal handed unchanged to ``Mfcc.compute_features``.

        Returns:
            The value returned by ``Mfcc.compute_features``.

        Raises:
            ValueError: If option setup or feature extraction raises. The
                original exception is logged with ``self.log.error`` and
                attached as the cause.
        """
        try:
            po = ParseOptions("")
            mfcc_opts = MfccOptions()
            mfcc_opts.use_energy = False
            mfcc_opts.frame_opts.samp_freq = self.sr
            mfcc_opts.frame_opts.frame_length_ms = self.frame_length_s*1000
            mfcc_opts.frame_opts.frame_shift_ms = self.frame_shift_s*1000
            mfcc_opts.frame_opts.allow_downsample = False
            mfcc_opts.mel_opts.num_bins = self.num_bins
            mfcc_opts.mel_opts.low_freq = self.low_freq
            mfcc_opts.mel_opts.high_freq = self.high_freq
            mfcc_opts.num_ceps = self.num_ceps
            mfcc_opts.register(po)

            # Build the extractor and run it with a VTLN warp factor of 1.0.
            extractor = Mfcc(mfcc_opts)
            features = extractor.compute_features(wav, self.sr, 1.0)
        except Exception as e:
            self.log.error(e)
            # Chain explicitly so the traceback names the root cause.
            raise ValueError(
                "Speaker diarization failed while extracting features!!!"
            ) from e
        else:
            return features
Esempio n. 6
0
    print(key, out["text"], flush=True)

print("-" * 80, flush=True)


# Define feature pipeline in code
def make_feat_pipeline(base, opts=None):
    """Build a ``wav -> features`` callable around a base feature extractor.

    Args:
        base: Feature extractor providing ``compute_features`` and ``dim``.
        opts: Delta feature options passed to ``compute_deltas``. When
            omitted, a fresh ``DeltaFeaturesOptions()`` is created for this
            pipeline instead of one instance built at definition time and
            shared by every call.

    Returns:
        A function taking a wave object and returning the result of
        ``compute_deltas`` on the CMVN-processed base features.
    """
    if opts is None:
        opts = DeltaFeaturesOptions()

    def feat_pipeline(wav):
        # Base features from the first channel, VTLN warp factor 1.0.
        feats = base.compute_features(wav.data()[0], wav.samp_freq, 1.0)
        # Accumulate CMVN statistics over this utterance's features and
        # apply them to the same features.
        cmvn = Cmvn(base.dim())
        cmvn.accumulate(feats)
        cmvn.apply(feats)
        return compute_deltas(opts, feats)

    return feat_pipeline


# Frame extraction settings: a sample frequency of 16000 with
# allow_downsample switched on.
# NOTE(review): presumably this lets input recorded at a higher rate be
# downsampled to 16 kHz -- confirm against the Kaldi option documentation.
frame_opts = FrameExtractionOptions()
frame_opts.samp_freq = 16000
frame_opts.allow_downsample = True
# MFCC options without energy, using the frame settings above.
mfcc_opts = MfccOptions()
mfcc_opts.use_energy = False
mfcc_opts.frame_opts = frame_opts
# Wrap the MFCC extractor in the pipeline factory (default delta options).
feat_pipeline = make_feat_pipeline(Mfcc(mfcc_opts))

# Decode every utterance listed in wav.scp and print "<key> <text>".
for key, wav in SequentialWaveReader("scp:wav.scp"):
    feats = feat_pipeline(wav)
    out = asr.decode(feats)
    # flush=True so each result is emitted as soon as it is decoded.
    print(key, out["text"], flush=True)
Esempio n. 7
0
def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Compute MFCC features for every utterance of a wave table.

    Args:
        wav_rspecifier: Read specifier passed to ``SequentialWaveReader``.
        feats_wspecifier: Write specifier passed to ``MatrixWriter``.
        opts: Options object. The attributes read here are ``vtln_map``,
            ``utt2spk``, ``min_duration``, ``channel`` (-1 selects channel
            0), ``vtln_warp`` and ``subtract_mean``.
        mfcc_opts: Options used to construct the ``Mfcc`` extractor.

    Returns:
        True if features were written for at least one utterance.
    """
    mfcc = Mfcc(mfcc_opts)

    vtln_map_reader = None
    if opts.vtln_map:
        vtln_map_reader = RandomAccessFloatReaderMapped(
            opts.vtln_map, opts.utt2spk)
    elif opts.utt2spk:
        print("utt2spk option is needed only if vtln-map option is specified.",
              file=sys.stderr)

    num_utts, num_success = 0, 0
    try:
        with SequentialWaveReader(wav_rspecifier) as reader, \
             MatrixWriter(feats_wspecifier) as writer:
            for num_utts, (key, wave) in enumerate(reader, 1):
                if wave.duration < opts.min_duration:
                    print("File: {} is too short ({} sec): "
                          "producing no output.".format(key, wave.duration),
                          file=sys.stderr)
                    continue

                num_chan = wave.data().num_rows
                if opts.channel >= num_chan:
                    # The placeholders were previously printed literally
                    # because the message was never formatted.
                    print("File with id {} has {} channels but you specified "
                          "channel {}, producing no output.".format(
                              key, num_chan, opts.channel),
                          file=sys.stderr)
                    continue
                channel = 0 if opts.channel == -1 else opts.channel

                if vtln_map_reader is not None:
                    if key not in vtln_map_reader:
                        print("No vtln-map entry for utterance-id "
                              "(or speaker-id)",
                              key,
                              file=sys.stderr)
                        continue
                    vtln_warp = vtln_map_reader[key]
                else:
                    vtln_warp = opts.vtln_warp

                try:
                    feats = mfcc.compute_features(wave.data()[channel],
                                                  wave.samp_freq, vtln_warp)
                except Exception:
                    # Best effort per utterance, but no longer swallowing
                    # KeyboardInterrupt / SystemExit as a bare except did.
                    print("Failed to compute features for utterance",
                          key,
                          file=sys.stderr)
                    continue

                if opts.subtract_mean:
                    # Per-utterance mean normalisation: subtract the mean
                    # row from every row of the feature matrix.
                    mean = Vector(feats.num_cols)
                    mean.add_row_sum_mat_(1.0, feats)
                    mean.scale_(1.0 / feats.num_rows)
                    for i in range(feats.num_rows):
                        feats[i].add_vec_(-1.0, mean)

                writer[key] = feats
                num_success += 1

                if num_utts % 10 == 0:
                    print("Processed {} utterances".format(num_utts),
                          file=sys.stderr)

        print("Done {} out of {} utterances".format(num_success, num_utts),
              file=sys.stderr)
    finally:
        # Close the VTLN map reader even when processing raised.
        if vtln_map_reader is not None:
            vtln_map_reader.close()

    return num_success != 0
Esempio n. 8
0
                voice_feats.row(index).copy_row_from_mat_(feats, i)
                index += 1

        LOG.debug('Feats extract successed')
        return voice_feats

    return feat_pipeline


# MFCC extractor: sample frequency 16000 with downsampling allowed,
# 40 mel bins, 20 cepstral coefficients, energy enabled. The options must
# be fully set before Mfcc(...) is constructed from them.
mfcc_opts = MfccOptions()
mfcc_opts.frame_opts.samp_freq = 16000
mfcc_opts.frame_opts.allow_downsample = True
mfcc_opts.mel_opts.num_bins = 40
mfcc_opts.num_ceps = 20
mfcc_opts.use_energy = True
mfcc = Mfcc(mfcc_opts)

# Sliding-window cepstral mean normalisation options: window of 300,
# centered, without variance normalisation.
# NOTE(review): the window is presumably counted in frames -- confirm.
sliding_opts = SlidingWindowCmnOptions()
sliding_opts.cmn_window = 300
sliding_opts.normalize_variance = False
sliding_opts.center = True

# Energy-based voice activity detection thresholds.
vad_opts = VadEnergyOptions()
vad_opts.vad_energy_threshold = 5.5
vad_opts.vad_energy_mean_scale = 0.5

# Delta feature options: window 3, order 2.
delta_opts = DeltaFeaturesOptions()
delta_opts.window = 3
delta_opts.order = 2

# Assemble the feature pipeline from the extractor and the option objects.
feat_pipeline = make_feat_pipeline(mfcc, sliding_opts, vad_opts, delta_opts)
def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Extract raw signal windows for every utterance of a wave table.

    Despite its name, the function writes the output of
    ``extract_windows`` on the peak-normalised signal rather than MFCCs.

    Args:
        wav_rspecifier: Read specifier passed to ``SequentialWaveReader``.
        feats_wspecifier: Write specifier passed to ``MatrixWriter``.
        opts: Options object. The attributes read here are
            ``sampling_rate``, ``label_window_length``,
            ``label_window_shift``, ``signal_window_length`` (the three
            window values are treated as milliseconds), ``min_duration``,
            ``channel`` (-1 selects channel 0) and ``subtract_mean``.
        mfcc_opts: Options used to construct an ``Mfcc`` object.

    Returns:
        True if features were written for at least one utterance.
    """
    # NOTE(review): the extractor is not used below; it is still built in
    # case construction validates mfcc_opts -- confirm and drop if not.
    mfcc = Mfcc(mfcc_opts)

    # Window sizes in samples (options are in ms), so that feats align
    # with the label windows.
    lab_window_len_sample = int(
        (opts.sampling_rate * opts.label_window_length) / 1000)
    lab_window_shift_sample = int(
        (opts.sampling_rate * opts.label_window_shift) / 1000)
    sig_window_len_sample = int(
        (opts.sampling_rate * opts.signal_window_length) / 1000)

    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
         MatrixWriter(feats_wspecifier) as writer:
        for num_utts, (key, wave) in enumerate(reader, 1):
            if wave.duration < opts.min_duration:
                print("File: {} is too short ({} sec): producing no output.".
                      format(key, wave.duration),
                      file=sys.stderr)
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # The placeholders were previously printed literally
                # because the message was never formatted.
                print("File with id {} has {} channels but you specified "
                      "channel {}, producing no output.".format(
                          key, num_chan, opts.channel),
                      file=sys.stderr)
                continue
            channel = 0 if opts.channel == -1 else opts.channel

            try:
                # Move signal from integers to floats
                signal = wave.data()[channel].numpy()
                signal = signal.astype(float) / 2**15  # 32768  # int to float
                # Peak-normalise; an all-zero signal is left untouched
                # instead of being turned into NaNs by a 0/0 division.
                peak = np.max(np.abs(signal))
                if peak > 0:
                    signal /= peak

                # Extract windows
                feats = extract_windows(signal, sig_window_len_sample,
                                        lab_window_len_sample,
                                        lab_window_shift_sample)
            except Exception:
                # Best effort per utterance, but no longer swallowing
                # KeyboardInterrupt / SystemExit as a bare except did.
                print("Failed to compute features for utterance",
                      key,
                      file=sys.stderr)
                continue

            if opts.subtract_mean:
                # NOTE(review): assumes extract_windows returns a Kaldi
                # matrix (num_rows / num_cols / row views) -- confirm.
                mean = Vector(feats.num_cols)
                mean.add_row_sum_mat_(1.0, feats)
                mean.scale_(1.0 / feats.num_rows)
                for i in range(feats.num_rows):
                    feats[i].add_vec_(-1.0, mean)

            writer[key] = feats
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    return num_success != 0
Esempio n. 10
0
from kaldi.gmm.am import AmDiagGmm, DecodableAmDiagGmmScaled
from kaldi.hmm import TransitionModel
from kaldi.util.io import xopen
from kaldi.util.table import SequentialWaveReader


# Define the feature pipeline: (wav) -> feats
def make_feat_pipeline(base, opts=None):
    """Build a ``wav -> features`` callable around a base feature extractor.

    Args:
        base: Feature extractor providing ``compute_features``.
        opts: Delta feature options passed to ``compute_deltas``. When
            omitted, a fresh ``DeltaFeaturesOptions()`` is created for this
            pipeline instead of one instance built at definition time and
            shared by every call.

    Returns:
        A function taking a wave object and returning the result of
        ``compute_deltas`` on the base features of its first channel.
    """
    if opts is None:
        opts = DeltaFeaturesOptions()

    def feat_pipeline(wav):
        # Base features from the first channel, VTLN warp factor 1.0.
        feats = base.compute_features(wav.data()[0], wav.samp_freq, 1.0)
        return compute_deltas(opts, feats)

    return feat_pipeline


# Feature pipeline built on an MFCC extractor with default options.
feat_pipeline = make_feat_pipeline(Mfcc(MfccOptions()))

# Read the model: the transition model and the GMM acoustic model are read
# one after the other from the same opened stream.
# NOTE(review): the path is hard-coded to one user's home directory.
with xopen("/home/dogan/tools/pykaldi/egs/models/wsj/final.mdl") as ki:
    trans_model = TransitionModel().read(ki.stream(), ki.binary)
    acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary)


# Define the decodable wrapper: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Bind a transition model and an acoustic model into a factory.

    Args:
        trans_model: Transition model handed to every decodable object.
        acoustic_model: Acoustic model handed to every decodable object.

    Returns:
        A function ``(features, acoustic_scale) -> decodable`` that builds
        a ``DecodableAmDiagGmmScaled`` from the two bound models.
    """

    def decodable_wrapper(features, acoustic_scale):
        # The acoustic model comes before the transition model here.
        decodable = DecodableAmDiagGmmScaled(
            acoustic_model, trans_model, features, acoustic_scale)
        return decodable

    return decodable_wrapper