Example 1
    def mfcc_to_ivector(self, fea):
        n_data, d_data = fea.shape

        l = 0
        lc = 0
        n = np.zeros((self.numG), dtype=np.float32)
        f = np.zeros((self.numG, self.dimF), dtype=np.float32)

        print '  Computing stats ...',
        # Note that we compute the stats in sub-chunks to keep memory usage low
        #
        seq_data = self.split_seq(range(n_data), 1000)
        for i in range(len(seq_data)):
            dd = fea[seq_data[i], :]
            l1, n1, f1 = gmm.gmm_eval(dd, self.GMM, return_accums=1)
            l = l + l1.sum()
            lc = lc + l1.shape[0]
            n = n + n1
            f = f + f1

        print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']'

        n, f = self.normalize_stats(n, f, self.ubm_means, self.ubm_norm)

        f = self.row(f.astype(self.v.dtype))
        n = self.row(n.astype(self.v.dtype))

        print '  Computing i-vector'
        w = iv.estimate_i(n, f, self.v, self.MVVT).T

        print "IVECTOR", w

        return w
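For orientation: the loop above accumulates standard zeroth- and first-order Baum-Welch statistics over the frames. Below is a self-contained pure-numpy sketch of the same kind of accumulation for a small diagonal-covariance GMM; it uses toy data and does not depend on the `gmm` module, so it only illustrates what `gmm.gmm_eval(..., return_accums=1)` conceptually returns.

import numpy as np

def accumulate_stats(X, weights, means, variances):
    # Zeroth/first-order GMM statistics for data X (T x D), diagonal covariances.
    log_w = np.log(weights)                                  # (C,)
    log_det = np.sum(np.log(variances), axis=1)              # (C,)
    dif = X[:, None, :] - means[None, :, :]                  # (T, C, D)
    log_gauss = -0.5 * (np.sum(dif ** 2 / variances, axis=2)
                        + log_det
                        + means.shape[1] * np.log(2 * np.pi))
    log_post = log_w + log_gauss                             # (T, C)
    llh = np.logaddexp.reduce(log_post, axis=1)              # per-frame log-likelihood
    gamma = np.exp(log_post - llh[:, None])                  # responsibilities
    n = gamma.sum(axis=0)                                    # zeroth-order stats (C,)
    f = gamma.T.dot(X)                                       # first-order stats (C, D)
    return llh, n, f

# toy check: 3 components in 4 dimensions, random data
X = np.random.randn(1000, 4)
weights = np.full(3, 1.0 / 3)
means = np.random.randn(3, 4)
variances = np.ones((3, 4))
llh, n, f = accumulate_stats(X, weights, means, variances)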
Example 2
def compute_vad(s,
                win_length=160,
                win_overlap=80,
                n_realignment=5,
                threshold=0.3):
    # power signal for energy computation
    s = s**2

    # frame signal with overlap
    F = features.framing(s, win_length, win_length - win_overlap)

    # sum frames to get energy
    E = F.sum(axis=1)

    # E = np.sqrt(E)
    # E = np.log(E)

    # normalize the energy
    E -= E.mean()
    E /= E.std()

    # initialization
    mm = np.array((-1.00, 0.00, 1.00))[:, np.newaxis]
    ee = np.array((1.00, 1.00, 1.00))[:, np.newaxis]
    ww = np.array((0.33, 0.33, 0.33))

    GMM = gmm.gmm_eval_prep(ww, mm, ee)

    E = E[:, np.newaxis]

    for i in xrange(n_realignment):
        # collect GMM statistics
        llh, N, F, S = gmm.gmm_eval(E, GMM, return_accums=2)

        # update model
        ww, mm, ee = gmm.gmm_update(N, F, S)

        # wrap model
        GMM = gmm.gmm_eval_prep(ww, mm, ee)

    # evaluate the gmm llhs
    llhs = gmm.gmm_llhs(E, GMM)

    llh = gmm.logsumexp(llhs, axis=1)[:, np.newaxis]

    llhs = np.exp(llhs - llh)

    out = np.zeros(llhs.shape[0], dtype=np.bool)
    out[llhs[:, 0] < threshold] = True

    return out
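Assuming the surrounding module (with its `features` and `gmm` dependencies) is importable, a usage sketch might look as follows; the file name is a placeholder and an 8 kHz, 16-bit mono recording is assumed, consistent with the default 160/80-sample (20 ms window, 10 ms shift) framing.

import numpy as np
from scipy.io import wavfile

# placeholder path; an 8 kHz mono recording is assumed
rate, sig = wavfile.read('utterance.wav')
vad = compute_vad(sig.astype(np.float64))   # boolean mask, one entry per frame
speech_frames = int(vad.sum())
speech_ratio = vad.mean()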
Example 3
def compute_vad(s,
                win_length=200,
                win_overlap=120,
                n_realignment=5,
                threshold=0.3):
    import gmm
    # power signal for energy computation
    s = s**2
    # frame signal with overlap
    F = framing(s, win_length, win_length - win_overlap)
    # sum frames to get energy
    E = F.sum(axis=1).astype(np.float64)
    # E = np.sqrt(E)
    # E = np.log(E)

    # normalize the energy
    E -= E.mean()
    try:
        E /= E.std()
        # initialization
        mm = np.array((-1.00, 0.00, 1.00))[:, np.newaxis]
        ee = np.array((1.00, 1.00, 1.00))[:, np.newaxis]
        ww = np.array((0.33, 0.33, 0.33))

        GMM = gmm.gmm_eval_prep(ww, mm, ee)

        E = E[:, np.newaxis]

        for i in range(n_realignment):
            # collect GMM statistics
            llh, N, F, S = gmm.gmm_eval(E, GMM, return_accums=2)

            # update model
            ww, mm, ee = gmm.gmm_update(N, F, S)
            # wrap model
            GMM = gmm.gmm_eval_prep(ww, mm, ee)

        # evaluate the gmm llhs
        llhs = gmm.gmm_llhs(E, GMM)

        llh = gmm.logsumexp(llhs, axis=1)[:, np.newaxis]

        llhs = np.exp(llhs - llh)

        out = np.zeros(llhs.shape[0], dtype=np.bool)
        out[llhs[:, 0] < threshold] = True
    except RuntimeWarning:
        logging.info("File contains only silence")
        out = np.zeros(E.shape[0], dtype=np.bool)

    return out
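One caveat for this variant: numpy only emits a RuntimeWarning (it does not raise) when `E.std()` is zero for an all-silence file, so the `except RuntimeWarning` branch is only reached if warnings are escalated to exceptions by the caller. The following sketch of that escalation is an assumption about how the function is meant to be invoked, not something shown in the source.

import warnings
import numpy as np

# Escalate numpy's RuntimeWarning ("invalid value encountered in divide",
# emitted when E.std() == 0) into a real exception so the except-branch in
# compute_vad() can catch it and fall back to an all-False mask.
warnings.simplefilter('error', RuntimeWarning)

silence = np.zeros(8000)          # one second of digital silence at an assumed 8 kHz
vad = compute_vad(silence)        # logs "File contains only silence"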
Example 4
def main(argv):
    fbank_mx = features.mel_fbank_mx(winlen_nfft=WINDOWSIZE / SOURCERATE,
                                     fs=fs,
                                     NUMCHANS=NUMCHANS,
                                     LOFREQ=LOFREQ,
                                     HIFREQ=HIFREQ)

    scp_list = sys.argv[1]
    vad_dir = sys.argv[2]
    wav_dir = sys.argv[3]
    ubm_file = sys.argv[4]
    v_file = sys.argv[5]
    out_dir = sys.argv[6]

    print 'Loading UBM from', ubm_file
    ubm_weights, ubm_means, ubm_covs = load_ubm(ubm_file)
    GMM = gmm.gmm_eval_prep(ubm_weights, ubm_means, ubm_covs)

    numG = ubm_means.shape[0]
    dimF = ubm_means.shape[1]

    # normalization of statistics - precomputing matrices
    if ubm_covs.shape[1] == dimF:
        ubm_norm = 1 / np.sqrt(ubm_covs)

    print 'Loading T matrix from ', v_file, '...'
    v = np.loadtxt(v_file, dtype=np.float32)

    print 'Computing MVVT ...'
    MVVT = iv.compute_VtV(v, numG)

    print 'Loading list of files to process from ' + scp_list
    seg_list = np.atleast_1d(np.loadtxt(scp_list, dtype=object))

    # extract all sub-dir names
    for dir in set(map(os.path.dirname, seg_list)):
        mkdir_p(out_dir + '/' + dir)

    # go over the scp and process the audio files
    for ii, fn in enumerate(seg_list, 1):
        try:
            print 'Processing ', ii, '/', len(seg_list), fn
            np.random.seed(777)

            wav_file = wav_dir + '/' + fn + '.wav'
            raw_file = wav_dir + '/' + fn + '.raw'
            lab_file = vad_dir + '/' + fn + '.lab.gz'
            ivec_out_file = out_dir + '/' + fn + '.ivec'

            if os.path.isfile(wav_file):
                print '  Reading wave file from ' + wav_file,
                rate, sig = spiowav.read(wav_file)

                if rate != 8000:
                    raise Exception(
                        'The input file ' + wav_file + ' is expected to be in 8000 Hz sampling rate, but ' + repr(
                            rate) + ' Hz detected')

            else:
                print '  Reading raw 8000 Hz, 16-bit, mono file from ' + raw_file,
                sig = np.fromfile(raw_file, dtype='int16')

            print '[t=' + repr(len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr(len(sig)) + ' samples]'

            if ADDDITHER > 0.0:
                print '  Adding dither'
                sig = features.add_dither(sig, ADDDITHER)

            print '  Extracting features',
            fea = features.mfcc_htk(sig,
                                    window=WINDOWSIZE / SOURCERATE,
                                    noverlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE,
                                    fbank_mx=fbank_mx,
                                    _0='first',
                                    NUMCEPS=NUMCEPS,
                                    RAWENERGY=RAWENERGY,
                                    PREEMCOEF=PREEMCOEF,
                                    CEPLIFTER=CEPLIFTER,
                                    ZMEANSOURCE=ZMEANSOURCE,
                                    ENORMALISE=ENORMALISE,
                                    ESCALE=0.1,
                                    SILFLOOR=50.0,
                                    USEHAMMING=True)

            print '[n=' + repr(len(fea)) + ' frames]'

            print '  Adding derivatives'
            # [add_deriv] step
            fea = features.add_deriv(fea, (deltawindow, accwindow))

            print '  Reshaping to SFeaCat convention'
            # [reshape] step
            fea = fea.reshape(fea.shape[0], 3, -1).transpose((0, 2, 1)).reshape(fea.shape[0], -1)  # re-order coeffs like SFeaCat

            if vad_dir == "auto":
                print '  Computing VAD '
                vad, n_regions, n_frames = compute_vad(sig, win_length=WINDOWSIZE / SOURCERATE,
                                                       win_overlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE)
            else:
                print '  Loading VAD definition from ' + lab_file
                vad, n_regions, n_frames = load_vad_lab_as_bool_vec(lab_file)

            # truncate the VAD mask to the number of feature frames
            vad = vad[:len(fea)]

            print '  Applying VAD [#frames=' + repr(n_frames) + ', #regions=' + repr(n_regions) + ']'
            fea = fea[vad, ...]

            if len(fea) < 3:
                raise NoVadException('Too few frames left: ' + str(len(fea)))

            print '  Applying floating CMVN'
            fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True)

            n_data, d_data = fea.shape

            l = 0
            lc = 0
            n = np.zeros((numG), dtype=np.float32)
            f = np.zeros((numG, dimF), dtype=np.float32)

            print '  Computing stats ...',
            # Note that we compute the stats in sub-chunks to keep memory usage low
            #
            seq_data = split_seq(range(n_data), 1000)
            for i in range(len(seq_data)):
                dd = fea[seq_data[i], :]
                l1, n1, f1 = gmm.gmm_eval(dd, GMM, return_accums=1)
                l = l + l1.sum()
                lc = lc + l1.shape[0]
                n = n + n1
                f = f + f1

            print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']'

            n, f = normalize_stats(n, f, ubm_means, ubm_norm)

            f = row(f.astype(v.dtype))
            n = row(n.astype(v.dtype))

            print '  Computing i-vector'
            w = iv.estimate_i(n, f, v, MVVT).T

            # write it to the disk
            print '  Saving ivec to:', ivec_out_file
            # np.savetxt(ivec_out_file, w.ravel(), newline=' ', fmt='%f')
            ivio.write_binary_ivector(ivec_out_file, w.ravel(), n_data / 100.0)

        except NoVadException as e:
            print e
            print "Warning: No features generated for segment: " + fn

        except:
            raise
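The helpers `split_seq`, `row` and `normalize_stats` are used but not defined in this snippet. The sketch below gives plausible stand-ins consistent with how they are called above (chunking frame indices, flattening to a 1 x N row, and whitening the first-order statistics by the UBM means and inverse standard deviations); the toolkit's own definitions may differ in detail.

import numpy as np

def split_seq(seq, size):
    # split a sequence of frame indices into consecutive chunks of at most `size`
    seq = list(seq)
    return [seq[i:i + size] for i in range(0, len(seq), size)]

def row(v):
    # flatten an array into a 1 x N row vector
    return v.reshape((1, v.size))

def normalize_stats(n, f, ubm_means, ubm_norm):
    # center the first-order stats around the UBM means and scale by 1/sqrt(cov)
    f = (f - n[:, np.newaxis] * ubm_means) * ubm_norm
    return n, f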
Example 5
        fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True)

        n_data, d_data = fea.shape

        l = 0
        lc = 0
        n = np.zeros((numG), dtype=np.float32)
        f = np.zeros((numG, dimF), dtype=np.float32)

        print '  Computing stats ...',
        # Note that we compute the stats in sub-chunks to keep memory usage low
        #
        seq_data = split_seq(range(n_data), 1000)
        for i in range(len(seq_data)):
            dd = fea[seq_data[i], :]
            l1, n1, f1 = gmm.gmm_eval(dd, GMM, return_accums=1)
            l = l + l1.sum()
            lc = lc + l1.shape[0]
            n = n + n1
            f = f + f1

        print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']'

        n, f = normalize_stats(n, f, ubm_means, ubm_norm)

        f = row(f.astype(v.dtype))
        n = row(n.astype(v.dtype))

        print '  Computing i-vector'
        w = iv.estimate_i(n, f, v, MVVT).T
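For reference, in the usual total-variability formulation the i-vector is the posterior mean of the latent factor given the whitened statistics, w = (I + sum_c N_c T_c' T_c)^(-1) T' f. The sketch below implements that textbook closed form in plain numpy with its own assumed block layout of T (one dim_f-row block per Gaussian); it is not the `ivec` module's code path, which presumably reuses the precomputed `MVVT` products for speed.

import numpy as np

def estimate_ivector(n, f, T, num_g, dim_f):
    # Posterior mean of the i-vector given whitened Baum-Welch statistics.
    #   n : (num_g,)           zeroth-order stats
    #   f : (num_g * dim_f,)   whitened first-order stats, per-Gaussian blocks
    #   T : (num_g * dim_f, R) total-variability matrix, same block layout
    R = T.shape[1]
    L = np.eye(R)                                  # posterior precision: I + sum_c N_c T_c' T_c
    Tf = np.zeros(R)
    for c in range(num_g):
        Tc = T[c * dim_f:(c + 1) * dim_f, :]       # rows of T for Gaussian c
        L += n[c] * Tc.T.dot(Tc)
        Tf += Tc.T.dot(f[c * dim_f:(c + 1) * dim_f])
    return np.linalg.solve(L, Tf)                  # w = L^{-1} T' f

# toy check with random numbers
num_g, dim_f, R = 8, 5, 10
T = 0.1 * np.random.randn(num_g * dim_f, R)
n = np.random.rand(num_g) * 50
f = np.random.randn(num_g * dim_f)
w = estimate_ivector(n, f, T, num_g, dim_f)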
Example 6
    def process_wav(self, wav_file, mode="ivector", vad_dir="auto"):
        if mode not in ["ivector", "statistics", "mfcc"]:
            return False

        else:
            # all constants are initialized in the __init__() method
            # READ WAVE AND COMPUTE IVECTOR
            sig, rate = librosa.load(wav_file)
            #print(librosa.get_duration(sig, rate))
            # wav conversion
            sig, rate = self.wav_conversion(sig, rate)
            #import sounddevice as sd
            #sd.play(sig, rate)

            if rate != 8000:
                raise Exception(
                    'The input file ' + wav_file +
                    ' is expected to be in 8000 Hz sampling rate, but ' +
                    repr(rate) + ' Hz detected')

            # print info about the signal
            print '[t=' + repr(
                len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr(
                    len(sig)) + ' samples]'

            if ADDDITHER > 0.0:
                print '  Adding dither'
                sig = features.add_dither(sig, ADDDITHER)

            print '  Extracting features',
            fea = features.mfcc_htk(sig,
                                    window=WINDOWSIZE / SOURCERATE,
                                    noverlap=(WINDOWSIZE - TARGETRATE) /
                                    SOURCERATE,
                                    fbank_mx=fbank_mx,
                                    _0='first',
                                    NUMCEPS=NUMCEPS,
                                    RAWENERGY=RAWENERGY,
                                    PREEMCOEF=PREEMCOEF,
                                    CEPLIFTER=CEPLIFTER,
                                    ZMEANSOURCE=ZMEANSOURCE,
                                    ENORMALISE=ENORMALISE,
                                    ESCALE=0.1,
                                    SILFLOOR=50.0,
                                    USEHAMMING=True)

            print '[n=' + repr(len(fea)) + ' frames]'

            print '  Adding derivatives'
            # [add_deriv] step
            fea = features.add_deriv(fea, (deltawindow, accwindow))

            print '  Reshaping to SFeaCat convention'
            # [reshape] step
            fea = fea.reshape(fea.shape[0], 3, -1).transpose((0, 2, 1)).reshape(fea.shape[0], -1)  # re-order coeffs like SFeaCat
            if vad_dir == "auto":
                print '  Computing VAD '
                vad, n_regions, n_frames = self.compute_vad(
                    sig,
                    win_length=WINDOWSIZE / SOURCERATE,
                    win_overlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE)
                # truncate the VAD mask to the number of feature frames
                vad = vad[:len(fea)]

                print '  Applying VAD [#frames=' + repr(
                    n_frames) + ', #regions=' + repr(n_regions) + ']'
                fea = fea[0:len(vad), ...]
                fea = fea[vad, ...]

                if len(fea) < 3:
                    raise NoVadException('Too few frames left: ' +
                                         str(len(fea)))

                print '  Applying floating CMVN'
                fea = features.cmvn_floating(fea,
                                             cmvn_lc,
                                             cmvn_rc,
                                             unbiased=True)

                if mode == "mfcc":
                    return fea

                n_data, d_data = fea.shape

                l = 0
                lc = 0
                n = np.zeros((self.numG), dtype=np.float32)
                f = np.zeros((self.numG, self.dimF), dtype=np.float32)

                print '  Computing stats ...',
                # Note that we compute the stats in sub-chunks to keep memory usage low
                #
                seq_data = self.split_seq(range(n_data), 1000)
                for i in range(len(seq_data)):
                    dd = fea[seq_data[i], :]
                    l1, n1, f1 = gmm.gmm_eval(dd, self.GMM, return_accums=1)
                    l = l + l1.sum()
                    lc = lc + l1.shape[0]
                    n = n + n1
                    f = f + f1

                print '[avg llh=' + repr(
                    l / lc) + ', #frames=' + repr(n_data) + ']'

                n, f = self.normalize_stats(n, f, self.ubm_means,
                                            self.ubm_norm)

                f = self.row(f.astype(self.v.dtype))
                n = self.row(n.astype(self.v.dtype))

                if mode == "statistics":
                    return f, n

                print '  Computing i-vector'
                w = iv.estimate_i(n, f, self.v, self.MVVT).T

                print "IVECTOR", w

                if mode == "ivector":
                    return w
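A usage sketch for this method, assuming `extractor` is an already-initialized instance of the surrounding class and 'utt.wav' is a placeholder path; the three modes return the i-vector, the normalized statistics, or the VAD-filtered and CMVN-normalized feature matrix, respectively.

# hypothetical usage; `extractor` must be an initialized instance of the class above
w = extractor.process_wav('utt.wav', mode="ivector")        # i-vector (row vector)
f, n = extractor.process_wav('utt.wav', mode="statistics")  # whitened first-/zeroth-order stats
fea = extractor.process_wav('utt.wav', mode="mfcc")         # VAD-filtered, CMVN'd MFCC features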