def find_queries(query_dir_path):
    """Collect query waveforms under sws2013_dev & sws2013_eval, grouped by query name."""
    # e.g. "sws2013_dev_123.wav" or "sws2013_dev_123_01.wav" -> "sws2013_dev_123"
    suffix_re = re.compile(r"(_[0-9]{2})?\.wav")
    queries = defaultdict(list)
    wav_paths = list(query_dir_path.glob("*.wav"))
    for path in tqdm(wav_paths, ncols=0, desc="Load queries"):
        name = suffix_re.sub("", path.name)
        # Load as mono 16 kHz with peak normalization.
        wav, sr = apply_effects_file(
            str(path), [["channels", "1"], ["rate", "16000"], ["norm"]])
        # Strip silence from both ends: vad only trims the front, so apply it
        # forward, reverse, apply again, and reverse back.
        trimmed, _ = apply_effects_tensor(
            wav, sr,
            [
                ["vad", "-T", "0.25", "-p", "0.1"],
                ["reverse"],
                ["vad", "-T", "0.25", "-p", "0.1"],
                ["reverse"],
            ],
        )
        # Keep the trimmed signal only if at least half a second survived.
        if trimmed.size(1) >= (sr * 0.5):
            wav = trimmed
        queries[name].append(wav.squeeze(0))
    return queries
def test_apply_effects(self, args):
    """`apply_effects_tensor` should return identical data as sox command"""
    effects = args['effects']
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)
    output_sr = args.get("output_sample_rate")

    input_path = self.get_temp_path('input.wav')
    reference_path = self.get_temp_path('reference.wav')

    # Write the test signal, then build the reference output via the sox CLI.
    original = get_sinusoid(
        frequency=800, sample_rate=input_sr,
        n_channels=num_channels, dtype='float32')
    save_wav(input_path, original, input_sr)
    sox_utils.run_sox_effect(
        input_path, reference_path, effects, output_sample_rate=output_sr)
    reference, reference_sr = load_wav(reference_path)

    result, result_sr = sox_effects.apply_effects_tensor(
        original, input_sr, effects)

    assert result_sr == reference_sr
    self.assertEqual(reference, result)
def test_apply_effects_tensor(self, args):
    """`apply_effects_tensor` should not crash"""
    effects = args['effects']
    input_sr = args.get("input_sample_rate", 8000)
    num_channels = args.get("num_channels", 2)

    waveform = get_sinusoid(
        frequency=800, sample_rate=input_sr,
        n_channels=num_channels, dtype='float32')
    # Smoke test only: the outputs are intentionally unused.
    _found, _sr = sox_effects.apply_effects_tensor(waveform, input_sr, effects)
def extract_embedding(audio_path, embedder):
    """Load an audio file, resample it to the embedder's rate, and embed it."""
    wav, sr = torchaudio.load(audio_path)  # 2D
    if sr != embedder.RATE:
        wav, sr = ta_sox.apply_effects_tensor(
            wav, sr, [["rate", str(embedder.RATE)]])
    # Best-effort: on RuntimeError (e.g. CUDA failure) return None
    # instead of crashing the caller.
    try:
        return embedder([wav[0].cuda().float()]).cpu().numpy()
    except RuntimeError:
        return None
def _sox_convert( waveform: torch.FloatTensor, sample_rate: int, effects: List[List[str]], ) -> torch.FloatTensor: try: import torchaudio.sox_effects as ta_sox except ImportError: raise ImportError("Please install torchaudio to convert audios") return ta_sox.apply_effects_tensor(waveform, sample_rate, effects)[0]
def _convert_to_mono(waveform: torch.FloatTensor, sample_rate: int) -> torch.FloatTensor: if waveform.shape[0] > 1: try: import torchaudio.sox_effects as ta_sox except ImportError: raise ImportError( "Please install torchaudio to convert multi-channel audios") effects = [['channels', '1']] return ta_sox.apply_effects_tensor(waveform, sample_rate, effects)[0] return waveform
def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
    """`apply_effects_tensor` without effects should return identical data as input"""
    source = get_wav_data(dtype, num_channels, channels_first=channels_first)
    pristine = source.clone()

    result, result_sr = sox_effects.apply_effects_tensor(
        pristine, sample_rate, [], channels_first)

    # An empty effect chain must not change the sample rate.
    assert result_sr == sample_rate
    # SoxEffect should not alter the input Tensor object
    self.assertEqual(source, pristine)
    # SoxEffect should not return the same Tensor object
    assert pristine is not result
    # Returned Tensor should equal to the input Tensor
    self.assertEqual(pristine, result)
def augment(self, x):
    """Perturb audio with random speed and pitch jitter plus gain normalization."""
    # NB: the two rng draws must stay in this order to keep the random stream
    # identical to the original implementation.
    rand_speed = self.rng.uniform(1 - self.speed_range, 1 + self.speed_range)
    rand_pitch = int(self.rng.uniform(-self.pitch_range, self.pitch_range))
    chain = [
        ['gain', '-n'],
        ['pitch', f"{rand_pitch}"],
        ['speed', f'{rand_speed:.2f}'],
    ]
    augmented, _ = apply_effects_tensor(
        x, self.sample_rate, chain, channels_first=True)
    return augmented
def crop_segment(tensor, tgt_dur, sample_rate=16000):
    """Cut a random window of `tgt_dur` seconds out of a 1D audio tensor."""
    src_dur = len(tensor) / sample_rate
    # Random start offset within the slack between source and target durations.
    shift = random.uniform(0, src_dur - tgt_dur)
    # Pad `tgt_dur` seconds of silence on both ends, then trim a `tgt_dur`
    # window starting `shift` seconds into the (padded) original signal.
    effects = [
        ["pad", f"{tgt_dur}", f"{tgt_dur}"],
        ["trim", f"{tgt_dur + shift}", f"{tgt_dur}"],
    ]
    cropped, _ = apply_effects_tensor(tensor.unsqueeze(0), sample_rate, effects)
    return cropped.squeeze(0)
def convert_waveform(
    waveform: Union[np.ndarray, torch.Tensor],
    sample_rate: int,
    normalize_volume: bool = False,
    to_mono: bool = False,
    to_sample_rate: Optional[int] = None
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
    """convert a waveform:
    - to a target sample rate
    - from multi-channel to mono channel
    - volume normalization

    Args:
        waveform (numpy.ndarray or torch.Tensor): 2D original waveform
            (channels x length)
        sample_rate (int): original sample rate
        normalize_volume (bool): perform volume normalization
        to_mono (bool): convert to mono channel if having multiple channels
        to_sample_rate (Optional[int]): target sample rate

    Returns:
        waveform (numpy.ndarray or torch.Tensor): converted 2D waveform,
            same container type as the input
        sample_rate (int): sample rate of the returned waveform

    Raises:
        ImportError: if a conversion is needed but torchaudio is not installed.
    """
    effects = []
    if normalize_volume:
        effects.append(["gain", "-n"])
    if to_sample_rate is not None and to_sample_rate != sample_rate:
        effects.append(["rate", f"{to_sample_rate}"])
    if to_mono and waveform.shape[0] > 1:
        effects.append(["channels", "1"])
    if len(effects) > 0:
        # Import lazily so torchaudio is only required when a conversion is
        # actually performed; the no-op path works without it.
        try:
            import torchaudio.sox_effects as ta_sox
        except ImportError as err:
            raise ImportError(
                "Please install torchaudio: pip install torchaudio") from err
        is_np_input = isinstance(waveform, np.ndarray)
        _waveform = torch.from_numpy(waveform) if is_np_input else waveform
        converted, converted_sample_rate = ta_sox.apply_effects_tensor(
            _waveform, sample_rate, effects)
        if is_np_input:
            # Return the same container type the caller passed in.
            converted = converted.numpy()
        return converted, converted_sample_rate
    return waveform, sample_rate
def test_apply_effects_tensor(self, args):
    """Scripted transform should match `apply_effects_tensor` after a save/load round trip."""
    effects = args['effects']
    channels_first = True
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)

    # Round-trip the transform through TorchScript serialization.
    transform = SoxEffectTensorTransform(effects, input_sr, channels_first)
    save_path = self.get_temp_path('sox_effect.zip')
    torch.jit.script(transform).save(save_path)
    scripted = torch.jit.load(save_path)

    wav = get_sinusoid(
        frequency=800, sample_rate=input_sr, n_channels=num_channels,
        dtype='float32', channels_first=channels_first)
    found, sr_found = scripted(wav)
    expected, sr_expected = sox_effects.apply_effects_tensor(
        wav, input_sr, effects, channels_first)

    assert sr_found == sr_expected
    self.assertEqual(expected, found)
def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: """Get mel-filter bank features via TorchAudio.""" try: import torch import torchaudio.compliance.kaldi as ta_kaldi import torchaudio.sox_effects as ta_sox waveform = torch.from_numpy(waveform) if len(waveform.shape) == 1: # Mono channel: D -> 1 x D waveform = waveform.unsqueeze(0) else: # Merge multiple channels to one: C x D -> 1 x D waveform, _ = ta_sox.apply_effects_tensor(waveform, sample_rate, ['channels', '1']) features = ta_kaldi.fbank(waveform, num_mel_bins=n_bins, sample_frequency=sample_rate) return features.numpy() except ImportError: return None
def process_wav(wav_files, out_dir, tempo, random_tempo=False):
    """Apply a sox tempo change to each wav file and write the result to out_dir."""
    # NOTE: must be careful about random seed
    if random_tempo:
        random.seed(int(tempo * 100))
    for wav_file in tqdm(wav_files):
        if random_tempo:
            tempo = round(random.uniform(0.9, 1.1), 2)
        wav, sr = librosa.load(wav_file, sr=None)
        signal = torch.from_numpy(wav).view(1, -1)
        # Tempo change via sox; the trailing rate effect keeps sr unchanged.
        stretched, out_sr = apply_effects_tensor(
            signal, sr, [["tempo", f"{tempo}"], ["rate", f"{sr}"]])
        stretched = stretched.view(-1)
        assert out_sr == sr
        suffix = str(tempo).replace("-", "minus") + "tempo_aug"
        out_file = join(out_dir, basename(wav_file).replace(".wav", f"_{suffix}.wav"))
        sf.write(out_file, stretched.numpy(), sr)
def forward(self, tensor: torch.Tensor):
    """Apply the stored sox effect chain to `tensor` at the stored sample rate."""
    result = sox_effects.apply_effects_tensor(
        tensor, self.sample_rate, self.effects, self.channels_first)
    return result
def forward(self, wav_tensor: torch.Tensor, sample_rate: int) -> torch.Tensor:
    """Apply self.effects to the waveform; the output sample rate is discarded."""
    processed, _ = apply_effects_tensor(wav_tensor, sample_rate, self.effects)
    return processed