Ejemplo n.º 1
0
def convert(catalog, dat_file, n_quant, sample_rate=16000):
    """
    Convert all input data and save a dat file.

    Each sound file listed in ``catalog`` is loaded with librosa at
    ``sample_rate``, mu-encoded with ``util.mu_encode_np`` and appended to a
    single flat array.  The result is pickled to ``dat_file`` as a dict with
    the keys ``'samples'`` (list of ``SpokenSample``), ``'snd_dtype'`` and
    ``'snd_data'``.

    Args:
        catalog: iterable of ``(voice_id, snd_path)`` pairs.  It is
            materialized into a list, so a generator is accepted too.
        dat_file: path of the pickle file to write.
        n_quant: number of quantization levels passed to
            ``util.mu_encode_np``; also selects the storage dtype.
        sample_rate: sampling rate requested from ``librosa.load``.
    """
    import librosa

    # The catalog is traversed twice and its length is reported in the
    # progress messages, so pin it down as a list first.
    catalog = list(catalog)

    # Smallest integer dtype used to store the quantized values.
    if n_quant <= 2**8:
        snd_dtype = np.uint8
    elif n_quant <= 2**15:
        snd_dtype = np.int16
    else:
        snd_dtype = np.int32

    # Number the speakers in order of first appearance so that the
    # voice_index values are reproducible from run to run (iteration order of
    # a set of strings is not).
    speaker_id_map = {}
    for voice_id, _ in catalog:
        speaker_id_map.setdefault(voice_id, len(speaker_id_map))

    snd_data = np.empty((0), dtype=snd_dtype)
    samples = []

    for (voice_id, snd_path) in catalog:
        # Pass the sampling rate by keyword: recent librosa releases no
        # longer accept it positionally.
        snd, _ = librosa.load(snd_path, sr=sample_rate)
        snd_mu = util.mu_encode_np(snd, n_quant).astype(snd_dtype)
        # [wav_b, wav_e) is the slice of snd_data holding this file.
        wav_b = len(snd_data)
        wav_e = wav_b + len(snd_mu)
        # Grow the buffer in place, then copy the new samples to its end.
        snd_data.resize(wav_e)
        snd_data[wav_b:wav_e] = snd_mu
        samples.append(
            SpokenSample(
                voice_index=speaker_id_map[voice_id],
                wav_b=wav_b,
                wav_e=wav_e,
                file_path=snd_path))
        if len(samples) % 100 == 0:
            # file= belongs to print(), not to str.format().
            print('Converted {} files of {}.'.format(len(samples),
                                                     len(catalog)),
                  file=stderr)
            stderr.flush()

    with open(dat_file, 'wb') as dat_fh:
        state = {
            'samples': samples,
            'snd_dtype': snd_dtype,
            'snd_data': snd_data
        }
        pickle.dump(state, dat_fh)
Ejemplo n.º 2
0
def convert(catalog,
            pfx,
            n_quant,
            sample_rate=16000,
            win_sz=400,
            hop_sz=160,
            n_mels=80,
            n_mfcc=13):
    """
    Convert all input data into three files sharing the prefix ``pfx``.

    * ``pfx + '.dat'``: raw bytes of the mu-encoded samples of every file,
      concatenated in catalog order.
    * ``pfx + '.mel'``: raw bytes of the features returned by
      ``mfcc.ProcessWav(...).func`` for every file, each transposed and
      flattened, concatenated in catalog order.
    * ``pfx + '.ind'``: pickled dict describing the two files above.

    The index dict holds ``window_size``, ``hop_size``, ``n_mel_chan``,
    ``snd_dtype``, ``n_quant``, the per-file lists ``voice_id``,
    ``n_snd_elem``, ``n_mel_elem`` and ``snd_path``, and the overall element
    counts ``total_n_snd_elem`` and ``total_n_mel_elem``.

    Args:
        catalog: iterable of ``(voice_id, snd_path)`` pairs.  It is
            materialized into a list, so a generator is accepted too.
        pfx: path prefix of the three output files.
        n_quant: number of quantization levels passed to
            ``util.mu_encode_np``; also selects the storage dtype.
        sample_rate: sampling rate requested from ``librosa.load``.
        win_sz, hop_sz, n_mels, n_mfcc: forwarded to ``mfcc.ProcessWav``.
    """
    # len(catalog) is reported in the progress messages.
    catalog = list(catalog)

    mfcc_proc = mfcc.ProcessWav(sample_rate, win_sz, hop_sz, n_mels, n_mfcc)

    # Smallest integer dtype used to store the quantized values.
    if n_quant <= 2**8:
        snd_dtype = np.uint8
    elif n_quant <= 2**15:
        snd_dtype = np.int16
    else:
        snd_dtype = np.int32

    snd_file = pfx + '.dat'
    ind_file = pfx + '.ind'
    mel_file = pfx + '.mel'
    # Per-file bookkeeping; one entry per catalog item in each list.
    ind = {'voice_id': [], 'n_snd_elem': [], 'n_mel_elem': [], 'snd_path': []}
    n_snd_elem = 0
    n_mel_elem = 0
    n_mel_chan = None

    with open(snd_file, 'wb') as snd_fh, open(mel_file, 'wb') as mel_fh:
        for (voice_id, snd_path) in catalog:
            # Pass the sampling rate by keyword: recent librosa releases no
            # longer accept it positionally.
            snd, _ = librosa.load(snd_path, sr=sample_rate)
            snd_mu = util.mu_encode_np(snd, n_quant).astype(snd_dtype)
            # mel: C, T  (channels, n_timesteps)
            # reshape to T, C and flatten
            mel = mfcc_proc.func(snd)
            if n_mel_chan is None:
                # Channel count is taken from the first file only.
                n_mel_chan = mel.shape[0]

            mel = mel.transpose((1, 0)).flatten()
            snd_fh.write(snd_mu.data)
            mel_fh.write(mel.data)
            ind['voice_id'].append(voice_id)
            ind['n_snd_elem'].append(snd.size)
            ind['n_mel_elem'].append(mel.size)
            ind['snd_path'].append(snd_path)
            if len(ind['voice_id']) % 100 == 0:
                # file= belongs to print(), not to str.format().
                print('Converted {} files of {}.'.format(len(ind['voice_id']),
                                                         len(catalog)),
                      file=stderr)
                stderr.flush()
            n_snd_elem += snd.size
            n_mel_elem += mel.size

    with open(ind_file, 'wb') as ind_fh:
        # 'n_snd_elem' and 'n_mel_elem' keep holding the per-file lists, as
        # they effectively did before (update() used to overwrite the scalar
        # totals stored under the same names).  The totals get their own keys
        # so they are no longer lost.
        index = {
            'window_size': win_sz,
            'hop_size': hop_sz,
            'total_n_snd_elem': n_snd_elem,
            'total_n_mel_elem': n_mel_elem,
            'n_mel_chan': n_mel_chan,
            'snd_dtype': snd_dtype,
            'n_quant': n_quant
        }
        index.update(ind)
        pickle.dump(index, ind_fh)