def run_audio(data, audio_base, audio_path, audio_conf):
    """Dump fixed-length spectrograms for every item of ``data`` into an HDF5 file.

    If ``audio_path`` already exists, nothing is done. Otherwise an HDF5 file
    is created with two datasets:

    * ``melspec``     -- shape ``(n, num_mel_bins, target_length)``, dtype ``'f'``
    * ``melspec_len`` -- shape ``(n,)``, dtype ``'i8'``

    where ``n = len(data)``.

    Args:
        data: Sized iterable of records; each record's ``'wav'`` entry is
            joined to ``audio_base`` with ``/`` to locate the audio file.
        audio_base: Directory prefix for the ``'wav'`` entries.
        audio_path: Destination HDF5 file path. Parent directories are
            created when missing.
        audio_conf: Configuration dict forwarded to ``compute_spectrogram``.
            It is updated in place with defaults for ``num_mel_bins`` (40),
            ``target_length`` (2048) and ``use_raw_length`` (False).

    Raises:
        ValueError: If ``audio_conf['use_raw_length']`` is truthy; the output
            datasets have a fixed width, so raw-length output cannot be stored.
    """
    if os.path.exists(audio_path):
        print('%s already exists. skip' % audio_path)
        return

    print('Dumping audio to HDF5 : %s' % audio_path)
    print('  audio_conf : %s' % audio_conf)

    # Fill in defaults in place: compute_spectrogram receives this same dict.
    audio_conf.setdefault('num_mel_bins', 40)
    audio_conf.setdefault('target_length', 2048)
    audio_conf.setdefault('use_raw_length', False)
    # Explicit raise rather than assert so the check survives `python -O`.
    if audio_conf['use_raw_length']:
        raise ValueError(
            "run_audio requires audio_conf['use_raw_length'] to be False")

    # dump audio
    n = len(data)
    Path(os.path.dirname(audio_path)).mkdir(parents=True, exist_ok=True)
    dset_mel_shape = (n, audio_conf['num_mel_bins'],
                      audio_conf['target_length'])
    try:
        # The context manager guarantees the HDF5 file is flushed and closed.
        with h5py.File(audio_path, 'w') as f:
            dset_mel = f.create_dataset('melspec', dset_mel_shape, dtype='f')
            dset_len = f.create_dataset('melspec_len', (n,), dtype='i8')

            start = time.time()
            for i, d in enumerate(data):
                # sr=None keeps the file's native sampling rate; it must be
                # passed by keyword because newer librosa makes it keyword-only.
                y, sr = librosa.load('%s/%s' % (audio_base, d['wav']), sr=None)
                logspec, n_frames = compute_spectrogram(y, sr, audio_conf)
                dset_mel[i, :, :] = logspec
                dset_len[i] = n_frames

                if i % 100 == 0:
                    t = time.time() - start
                    print('processed %d / %d audios (%.fs)' % (i, n, t))
    except BaseException:
        # A partial file would be mistaken for a finished dump by the
        # "already exists" guard on the next run, so remove it and re-raise.
        if os.path.exists(audio_path):
            os.remove(audio_path)
        raise
def load_raw_spectrogram(path):
    """Load an audio file and compute its raw-length spectrogram.

    The audio is loaded at 16 kHz and passed to ``compute_spectrogram`` with a
    fixed configuration (``audio_type='spectrogram'``, 5 ms window, 2 ms
    stride, ``n_fft=1024``, ``use_raw_length=True``).

    Args:
        path: Path of the audio file to load.

    Returns:
        A tuple ``(logspec, window_stride, sample_rate)`` where ``logspec`` is
        the result of calling ``.numpy()`` on the spectrogram returned by
        ``compute_spectrogram``, ``window_stride`` is ``0.002`` and
        ``sample_rate`` is ``16000``.
    """
    audio_conf = {
        'audio_type': 'spectrogram',
        'sample_rate': 16000,
        'window_size': 0.005,
        'window_stride': 0.002,
        'n_fft': 1024,
        'use_raw_length': True,
    }
    # sr is keyword-only in newer librosa releases; the keyword form works
    # with older releases as well.
    y, sr = librosa.load(path, sr=audio_conf['sample_rate'])
    logspec, _ = compute_spectrogram(y, sr, audio_conf)
    # compute_spectrogram's result exposes .numpy() (presumably a torch
    # tensor -- confirm against its definition).
    logspec = logspec.numpy()
    return logspec, audio_conf['window_stride'], audio_conf['sample_rate']
def load_mel_spectrogram_and_path(path):
    """Load an audio file at 16 kHz and compute its spectrogram.

    Args:
        path: Path of the audio file to load.

    Returns:
        A tuple ``(logmelspec, n_frames, path)``: the two values returned by
        ``compute_spectrogram`` followed by the input ``path`` unchanged.
    """
    # sr is keyword-only in newer librosa releases; the keyword form works
    # with older releases as well.
    y, sr = librosa.load(path, sr=16000)
    # NOTE(review): audio_conf is not a parameter of this function; it is
    # resolved from an enclosing or module scope -- confirm it is defined there.
    logmelspec, n_frames = compute_spectrogram(y, sr, audio_conf)
    return logmelspec, n_frames, path
def _LoadAudio(self, path):
    """Load an audio file at its native sampling rate and compute its spectrogram.

    Args:
        path: Path of the audio file to load.

    Returns:
        The ``(logspec, n_frames)`` pair produced by ``compute_spectrogram``
        using ``self.audio_conf``.
    """
    # sr=None asks librosa to keep the file's own sampling rate; it is passed
    # by keyword because newer librosa releases make sr keyword-only.
    y, sr = librosa.load(path, sr=None)
    logspec, n_frames = compute_spectrogram(y, sr, self.audio_conf)
    return logspec, n_frames