def test_load():
    """Exercise `paddleaudio.load` with default and explicit options.

    First call: only `sr` is given; the returned rate must equal it and the
    samples must be float32.
    Second call: `offset=1`, `duration=2` and `dtype='int16'` are added; the
    returned clip must be exactly 2.0 seconds long at the returned rate and
    carry the requested dtype.
    """
    target_sr = 16000

    # Default options.
    samples, rate = paddleaudio.load(TEST_FILE, sr=target_sr)
    assert rate == 16000
    assert samples.dtype == 'float32'

    # Explicit window and dtype.
    samples, rate = paddleaudio.load(TEST_FILE,
                                     sr=target_sr,
                                     offset=1,
                                     duration=2,
                                     dtype='int16')
    clip_seconds = len(samples) / rate
    assert clip_seconds == 2.0
    assert rate == 16000
    assert samples.dtype == 'int16'
def _load(self, file):
    """Load one example from `file` and return its feature array transposed.

    When `self.data_type == 'wav'` the file is read with `paddleaudio.load`
    at `c['sample_rate']`, padded, squared, projected through a librosa mel
    filter bank and converted to dB. Otherwise `file` is read with `np.load`
    and used as-is.

    In training mode the features are padded to `c['mel_crop_len']` and, when
    `self.augment` is set, randomly cropped and passed through
    `augmentation.spect_augment`. In evaluation mode they are padded to
    `c['max_mel_len']` and truncated to that length along axis 1.

    NOTE(review): the block nesting was reconstructed from a flattened
    source; both augmentation calls are assumed to sit under
    `if self.augment` - confirm against the original layout.

    Args:
        file: path handed to `paddleaudio.load` or `np.load`.

    Returns:
        `x.T`, the transposed feature array.
    """
    if self.data_type == 'wav':
        s, _ = paddleaudio.load(file, sr=c['sample_rate'])
        # NOTE(review): the 2-D pad spec presumes `s` is 2-D here - confirm.
        s = np.pad(s, ((0, 1), (0, 0)), 'constant', constant_values=(0, ))
        # A previous `(np.exp(s) - 1)**2` value was computed and immediately
        # overwritten; only this squared magnitude was ever used.
        power = np.abs(s)**2
        melW = librosa.filters.mel(sr=c['sample_rate'],
                                   n_fft=c['window_size'],
                                   n_mels=c['mel_bins'],
                                   fmin=c['fmin'],
                                   fmax=c['fmax'])
        mel = np.matmul(melW, power)
        x = librosa.power_to_db(mel, ref=1.0, amin=1e-10, top_db=None)
    else:
        s = np.load(file)
        # NOTE(review): looks like leftover debug output; kept so the
        # function's stdout behavior is unchanged.
        print(s.shape)
        x = s

    if self.training:
        x = self._pad(x, c['mel_crop_len'])
        if self.augment:
            x = augmentation.random_crop2d(x,
                                           c['mel_crop_len'],
                                           tempo_axis=1)
            x = augmentation.spect_augment(x, tempo_axis=1)
    else:
        # use all data for evaluation
        x = self._pad(x, c['max_mel_len'])
        x = x[:, :c['max_mel_len']]

    return x.T
def load_audio(file):
    """Read an audio file from a local path into a tensor.

    The audio is loaded at 16 kHz with `normal=True` and
    `norm_type='gaussian'`, i.e. re-normalized to zero mean and unit
    variance, then converted to a paddle tensor with one extra leading axis.
    """
    waveform, _ = paddleaudio.load(file,
                                   sr=16000,
                                   normal=True,
                                   norm_type='gaussian')
    return paddle.to_tensor(waveform).unsqueeze(0)
def get_feature(file, model, melspectrogram, random_sampling=False):
    """Return the normalized model output for `file`, cached per path.

    Results are memoized in the module-level `file2feature` mapping, so a
    path that was already processed is returned without touching the model.

    Args:
        file: audio path; also the cache key.
        model: callable applied to the float32 mel-spectrogram.
        melspectrogram: callable turning the waveform tensor into features.
        random_sampling: accepted but not used by this function.

    Returns:
        The model output divided by the square root of the sum of its
        squared entries.
    """
    # Cache hit: nothing to compute.
    if file in file2feature:
        return file2feature[file]

    waveform, _ = paddleaudio.load(file, sr=16000)  #, norm_type='gaussian')
    batch = paddle.to_tensor(waveform[None, :])
    spect = melspectrogram(batch).astype('float32')

    with paddle.no_grad():
        embedding = model(spect)  #.squeeze()

    # Scale by the L2 norm taken over every element of the output.
    l2_norm = paddle.sqrt(paddle.sum(embedding**2))
    embedding = embedding / l2_norm

    file2feature[file] = embedding
    return embedding
def load_and_extract_feature(file):
    """Load `file` and return its mel features as a tensor.

    The audio is read with `pa.load` at `c['sample_rate']`, turned into
    features by `pa.features.mel_spect` using the settings in the
    module-level config `c`, transposed, wrapped in `paddle.Tensor` and given
    two extra leading axes.
    """
    wav, _ = pa.load(file, sr=c['sample_rate'])

    mel_options = {
        'sample_rate': c['sample_rate'],
        'window_size': c['window_size'],
        'hop_length': c['hop_size'],
        'mel_bins': c['mel_bins'],
        'fmin': c['fmin'],
        'fmax': c['fmax'],
        'window': 'hann',
        'center': True,
        'pad_mode': 'reflect',
        'ref': 1.0,
        'amin': 1e-10,
        'top_db': None,
    }
    mel = pa.features.mel_spect(wav, **mel_options)

    mel = mel.T  #!!
    return paddle.Tensor(mel).unsqueeze((0, 1))
def load_and_extract_feature(file, c):
    """Load `file` and return its dB-scaled mel-spectrogram tensor.

    Args:
        file: audio path passed to `pa.load`.
        c: config mapping providing `sample_rate`, `window_size`,
            `hop_size`, `mel_bins`, `fmin` and `fmax`.

    Returns:
        The `melspectrogram` output with its last two axes swapped and one
        extra leading axis added.
    """
    wav, _ = pa.load(file, sr=c['sample_rate'])
    wav_tensor = paddle.to_tensor(wav)

    # The window length and FFT size both come from `window_size`.
    mel_options = {
        'sr': c['sample_rate'],
        'win_length': c['window_size'],
        'n_fft': c['window_size'],
        'hop_length': c['hop_size'],
        'n_mels': c['mel_bins'],
        'f_min': c['fmin'],
        'f_max': c['fmax'],
        'window': 'hann',
        'center': True,
        'pad_mode': 'reflect',
        'to_db': True,
        'amin': 1e-3,
        'top_db': None,
    }
    spect = melspectrogram(wav_tensor, **mel_options)

    return spect.transpose((0, 2, 1)).unsqueeze((0, ))
def test_case(sr, n_fft, hop_length, win_length, window, center, pad_mode,
              power, n_mels, f_min, f_max, dtype, device):
    """Check paddleaudio's mel-spectrogram against librosa's on a test clip.

    Both implementations are run on './test/unit_test/test_audio.wav' with
    the same settings, and the mean absolute difference must stay below
    1e-7 for float64 and 5e-7 for any other dtype.

    NOTE(review): the incoming `sr` and `power` are not forwarded - both
    calls use `sr=16000` and librosa is given `power=2.0`. Confirm that this
    is intended.
    """
    paddle.set_device(device)

    # The rate returned by the loader is not used below.
    signal, _ = paddleaudio.load('./test/unit_test/test_audio.wav')

    # Keyword arguments spelled identically in both libraries.
    shared = dict(sr=16000,
                  n_fft=n_fft,
                  hop_length=hop_length,
                  win_length=win_length,
                  window=window,
                  center=center,
                  n_mels=n_mels,
                  pad_mode=pad_mode,
                  htk=True,
                  norm='slaney')

    paddle_cpu_feat = paddleaudio.functional.melspectrogram(
        paddle.to_tensor(signal),
        f_min=f_min,
        f_max=f_max,
        dtype=dtype,
        **shared)
    librosa_feat = librosa.feature.melspectrogram(signal,
                                                  power=2.0,
                                                  fmin=f_min,
                                                  fmax=f_max,
                                                  **shared)

    err = np.mean(np.abs(librosa_feat - paddle_cpu_feat.numpy()))
    tolerance = 1.0e-07 if dtype == 'float64' else 5.0e-07
    assert err < tolerance
def __getitem__(self, idx):
    """Return one `(wav, cls_idx)` pair for the key at `idx`.

    `idx` is wrapped modulo the number of keys. The speaker id is the part
    of the key before the first '-', and it is mapped to a base class
    through `self.spk2cls`. One of three speed variants is then drawn at
    random (unchanged, resampled to 0.9x or to 1.1x of 16000), and the
    returned class index is `base * 3 + variant`. Finally the waveform is
    cropped or padded to `self.duration` (randomly when `self.augment` is
    set, centered otherwise).

    If a file cannot be loaded, the following keys are tried in order, and
    the returned label belongs to the key that was actually loaded.

    Args:
        idx: integer index; any value is accepted and wrapped.

    Returns:
        Tuple `(wav, cls_idx)`.

    Raises:
        RuntimeError: if none of the dataset's files could be loaded.
    """
    num_keys = len(self.keys)
    idx = idx % num_keys

    # Without augmentation the loader is asked for `self.duration` directly;
    # with augmentation the whole file is loaded and cropped afterwards.
    file_duration = None
    if not self.augment and self.duration:
        file_duration = self.duration

    # Move on to the next key when a file is unreadable, so that one bad
    # file cannot stall the loader, and stop once every key has been tried.
    # Only `Exception` is caught so that Ctrl-C still interrupts.
    for attempt in range(num_keys):
        key = self.keys[(idx + attempt) % num_keys]
        spk = key.split('-')[0]
        cls_idx = self.spk2cls[spk]
        file = self.key2file[key]
        try:
            wav, _ = paddleaudio.load(file,
                                      sr=self.sample_rate,
                                      duration=file_duration)
        except Exception:
            print(f'error loading file {file}')
            continue
        break
    else:
        raise RuntimeError('unable to load any audio file from the dataset')

    # Each speed variant is treated as its own class.
    # NOTE(review): the rate 16000 is hard-coded here while loading uses
    # `self.sample_rate` - confirm that they always agree.
    speed = random.choice([0, 1, 2])
    if speed == 1:
        wav = paddleaudio.resample(wav, 16000, 16000 * 0.9)
        cls_idx = cls_idx * 3 + 1
    elif speed == 2:
        wav = paddleaudio.resample(wav, 16000, 16000 * 1.1)
        cls_idx = cls_idx * 3 + 2
    else:
        cls_idx = cls_idx * 3

    if self.augment:
        wav = augments.random_crop_or_pad1d(wav, self.duration)
    elif self.duration:
        wav = augments.center_crop_or_pad1d(wav, self.duration)
    return wav, cls_idx
import paddleaudio


def _main():
    """Load './test_audio.m4a' with `paddleaudio.load`; the result is discarded."""
    paddleaudio.load('./test_audio.m4a')


if __name__ == '__main__':
    _main()
amin=1e-10, top_db=None) dst_h5.create_dataset(key, data=x) src_h5.close() dst_h5.close() if len(wav_files) > 0: assert args.dst_h5_file != '', 'for using wav file or wav list, dst_h5_file must be specified' dst_file = args.dst_h5_file assert not os.path.exists(dst_file), f'target file {dst_file} existed' dst_h5 = h5py.File(dst_file, "w") print(f'{len(wav_files)} wav files listed') for f in tqdm.tqdm(wav_files): s, _ = pa.load(f, sr=args.sample_rate) x = pa.melspectrogram(s, sr=args.sample_rate, window_size=args.window_size, hop_length=args.hop_length, n_mels=args.mel_bins, fmin=args.fmin, fmax=args.fmax, window='hann', center=True, pad_mode='reflect', ref=1.0, amin=1e-10, top_db=None) # figure(figsize=(8,8)) # imshow(x)