Example 1
import re
from collections import defaultdict

from torchaudio.sox_effects import apply_effects_file, apply_effects_tensor
from tqdm import tqdm


def find_queries(query_dir_path):
    """Find all queries under sws2013_dev & sws2013_eval."""

    # e.g. "sws2013_dev_123.wav" or "sws2013_dev_123_01.wav" -> "sws2013_dev_123"
    pattern = re.compile(r"(_[0-9]{2})?\.wav")

    query2tensors = defaultdict(list)
    for query_path in tqdm(list(query_dir_path.glob("*.wav")),
                           ncols=0,
                           desc="Load queries"):
        query_name = pattern.sub("", query_path.name)
        wav_tensor, sample_rate = apply_effects_file(
            str(query_path), [["channels", "1"], ["rate", "16000"], ["norm"]])
        trimmed, _ = apply_effects_tensor(
            wav_tensor,
            sample_rate,
            [
                ["vad", "-T", "0.25", "-p", "0.1"],
                ["reverse"],
                ["vad", "-T", "0.25", "-p", "0.1"],
                ["reverse"],
            ],
        )
        # Keep the VAD-trimmed audio only if at least 0.5 s remains.
        if trimmed.size(1) >= sample_rate * 0.5:
            wav_tensor = trimmed
        wav_tensor = wav_tensor.squeeze(0)
        query2tensors[query_name].append(wav_tensor)

    return query2tensors
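A minimal invocation sketch for the function above; the query directory path below is hypothetical.

from pathlib import Path

# Hypothetical location of the SWS2013 dev queries; find_queries expects a
# pathlib.Path so that .glob("*.wav") works.
query2tensors = find_queries(Path("sws2013Database/dev_queries"))
for name, tensors in list(query2tensors.items())[:3]:
    print(name, [tuple(t.shape) for t in tensors])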
Example 2
    def test_apply_effects(self, args):
        """`apply_effects_tensor` should return identical data as sox command"""
        effects = args['effects']
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        output_sr = args.get("output_sample_rate")

        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')

        original = get_sinusoid(frequency=800,
                                sample_rate=input_sr,
                                n_channels=num_channels,
                                dtype='float32')
        save_wav(input_path, original, input_sr)
        sox_utils.run_sox_effect(input_path,
                                 reference_path,
                                 effects,
                                 output_sample_rate=output_sr)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_tensor(original, input_sr,
                                                     effects)

        assert sr == expected_sr
        self.assertEqual(expected, found)
Example 3
    def test_apply_effects_tensor(self, args):
        """`apply_effects_tensor` should not crash"""
        effects = args['effects']
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        original = get_sinusoid(
            frequency=800, sample_rate=input_sr,
            n_channels=num_channels, dtype='float32')
        _found, _sr = sox_effects.apply_effects_tensor(original, input_sr, effects)
Example 4
def extract_embedding(audio_path, embedder):
    wav, sr = torchaudio.load(audio_path)  # 2D tensor: (channels, samples)
    if sr != embedder.RATE:
        wav, sr = ta_sox.apply_effects_tensor(
            wav, sr, [["rate", str(embedder.RATE)]])
    try:
        emb = embedder([wav[0].cuda().float()]).cpu().numpy()
    except RuntimeError:
        emb = None
    return emb
Example 5
def _sox_convert(
    waveform: torch.FloatTensor,
    sample_rate: int,
    effects: List[List[str]],
) -> torch.FloatTensor:
    try:
        import torchaudio.sox_effects as ta_sox
    except ImportError:
        raise ImportError("Please install torchaudio to convert audios")
    return ta_sox.apply_effects_tensor(waveform, sample_rate, effects)[0]
Example 6
def _convert_to_mono(waveform: torch.FloatTensor,
                     sample_rate: int) -> torch.FloatTensor:
    if waveform.shape[0] > 1:
        try:
            import torchaudio.sox_effects as ta_sox
        except ImportError:
            raise ImportError(
                "Please install torchaudio to convert multi-channel audios")
        effects = [['channels', '1']]
        return ta_sox.apply_effects_tensor(waveform, sample_rate, effects)[0]
    return waveform
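A quick sketch of the helper above on synthetic data; the stereo tensor is made up for illustration.

import torch

stereo = torch.rand(2, 16000)           # 2 channels, 1 s at 16 kHz
mono = _convert_to_mono(stereo, 16000)  # sox "channels 1" downmix -> shape (1, 16000)
print(mono.shape)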
Example 7
    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
        """`apply_effects_tensor` without effects should return identical data as input"""
        original = get_wav_data(dtype, num_channels, channels_first=channels_first)
        expected = original.clone()
        found, output_sample_rate = sox_effects.apply_effects_tensor(
            expected, sample_rate, [], channels_first)

        assert output_sample_rate == sample_rate
        # SoxEffect should not alter the input Tensor object
        self.assertEqual(original, expected)
        # SoxEffect should not return the same Tensor object
        assert expected is not found
        # The returned Tensor should be equal to the input Tensor
        self.assertEqual(expected, found)
Example 8
    def augment(self, x):
        speed = self.rng.uniform(1 - self.speed_range, 1 + self.speed_range)
        pitch = int(self.rng.uniform(-self.pitch_range, self.pitch_range))
        effects = [
            ['gain', '-n'],
            ['pitch', f"{pitch}"],
            ['speed', f'{speed:.2f}'],
        ]

        x, _ = apply_effects_tensor(x,
                                    self.sample_rate,
                                    effects,
                                    channels_first=True)

        return x
Example 9
def crop_segment(tensor, tgt_dur, sample_rate=16000):
    src_dur = len(tensor) / sample_rate
    random_shift = random.uniform(0, src_dur - tgt_dur)
    audio_tensor, _ = apply_effects_tensor(
        tensor.unsqueeze(0),
        sample_rate,
        [
            ["pad", f"{tgt_dur}", f"{tgt_dur}"],
            [
                "trim",
                f"{tgt_dur + random_shift}",
                f"{tgt_dur}",
            ],
        ],
    )
    return audio_tensor.squeeze(0)
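An illustrative call with arbitrary values, assuming random and apply_effects_tensor are imported as in the snippet above.

import torch

wav = torch.rand(3 * 16000)               # 1-D waveform, 3 s at 16 kHz
segment = crop_segment(wav, tgt_dur=1.0)  # random 1-second crop
print(segment.shape)                      # about 16000 samples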
Example 10
def convert_waveform(
    waveform: Union[np.ndarray, torch.Tensor],
    sample_rate: int,
    normalize_volume: bool = False,
    to_mono: bool = False,
    to_sample_rate: Optional[int] = None
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
    """convert a waveform:
        - to a target sample rate
        - from multi-channel to mono channel
        - volume normalization

        Args:
            waveform (numpy.ndarray or torch.Tensor): 2D original waveform
                (channels x length)
            sample_rate (int): original sample rate
            normalize_volume (bool): perform volume normalization
            to_mono (bool): convert to mono channel if having multiple channels
            to_sample_rate (Optional[int]): target sample rate
        Returns:
            waveform (numpy.ndarray): converted 2D waveform (channels x length)
            sample_rate (float): target sample rate
        """
    try:
        import torchaudio.sox_effects as ta_sox
    except ImportError:
        raise ImportError("Please install torchaudio: pip install torchaudio")

    effects = []
    if normalize_volume:
        effects.append(["gain", "-n"])
    if to_sample_rate is not None and to_sample_rate != sample_rate:
        effects.append(["rate", f"{to_sample_rate}"])
    if to_mono and waveform.shape[0] > 1:
        effects.append(["channels", "1"])
    if len(effects) > 0:
        is_np_input = isinstance(waveform, np.ndarray)
        _waveform = torch.from_numpy(waveform) if is_np_input else waveform
        converted, converted_sample_rate = ta_sox.apply_effects_tensor(
            _waveform, sample_rate, effects)
        if is_np_input:
            converted = converted.numpy()
        return converted, converted_sample_rate
    return waveform, sample_rate
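A small sketch with made-up data: downmix and resample a stereo NumPy waveform to 16 kHz mono with volume normalization.

import numpy as np

wav = np.random.randn(2, 44100).astype(np.float32)  # channels x length at 44.1 kHz
converted, sr = convert_waveform(
    wav, 44100, normalize_volume=True, to_mono=True, to_sample_rate=16000)
print(converted.shape, sr)  # roughly (1, 16000) and 16000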
Example 11
    def test_apply_effects_tensor(self, args):
        effects = args['effects']
        channels_first = True
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)

        trans = SoxEffectTensorTransform(effects, input_sr, channels_first)

        path = self.get_temp_path('sox_effect.zip')
        torch.jit.script(trans).save(path)
        trans = torch.jit.load(path)

        wav = get_sinusoid(
            frequency=800, sample_rate=input_sr,
            n_channels=num_channels, dtype='float32', channels_first=channels_first)
        found, sr_found = trans(wav)
        expected, sr_expected = sox_effects.apply_effects_tensor(
            wav, input_sr, effects, channels_first)

        assert sr_found == sr_expected
        self.assertEqual(expected, found)
Example 12
def _get_torchaudio_fbank(waveform,
                          sample_rate,
                          n_bins=80) -> Optional[np.ndarray]:
    """Get mel-filter bank features via TorchAudio."""
    try:
        import torch
        import torchaudio.compliance.kaldi as ta_kaldi
        import torchaudio.sox_effects as ta_sox

        waveform = torch.from_numpy(waveform)
        if len(waveform.shape) == 1:
            # Mono channel: D -> 1 x D
            waveform = waveform.unsqueeze(0)
        else:
            # Merge multiple channels to one: C x D -> 1 x D
            waveform, _ = ta_sox.apply_effects_tensor(
                waveform, sample_rate, [["channels", "1"]])

        features = ta_kaldi.fbank(waveform,
                                  num_mel_bins=n_bins,
                                  sample_frequency=sample_rate)
        return features.numpy()
    except ImportError:
        return None
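A hypothetical call: 80-bin filterbank features for one second of synthetic mono audio.

import numpy as np

wav = np.random.randn(16000).astype(np.float32)
fbank = _get_torchaudio_fbank(wav, 16000, n_bins=80)
# (num_frames, 80) when torchaudio is installed, otherwise None
print(None if fbank is None else fbank.shape)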
Example 13
def process_wav(wav_files, out_dir, tempo, random_tempo=False):
    # NOTE: must be careful about random seed
    if random_tempo:
        random.seed(int(tempo * 100))

    for wav_file in tqdm(wav_files):
        if random_tempo:
            tempo = round(random.uniform(0.9, 1.1), 2)

        wav, sr = librosa.load(wav_file, sr=None)
        x = torch.from_numpy(wav).view(1, -1)

        # time-stretch with the sox "tempo" effect (pitch is preserved);
        # the trailing "rate" effect keeps the original sample rate
        effects = [["tempo", f"{tempo}"], ["rate", f"{sr}"]]
        y, y_sr = apply_effects_tensor(x, sr, effects)
        y = y.view(-1)

        assert y_sr == sr

        postfix = str(tempo).replace("-", "minus") + "tempo_aug"

        out_file = join(out_dir,
                        basename(wav_file).replace(".wav", f"_{postfix}.wav"))
        sf.write(out_file, y.numpy(), sr)
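A hypothetical invocation (the directories are made up and must already exist), assuming the snippet's own imports (librosa, soundfile as sf, tqdm, torch, os.path, random, apply_effects_tensor) are in scope.

from glob import glob

wav_files = sorted(glob("data/raw/*.wav"))  # made-up input directory
# with random_tempo=True the tempo argument only seeds the RNG
process_wav(wav_files, "data/tempo_aug", tempo=1.0, random_tempo=True)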
Example 14
    def forward(self, tensor: torch.Tensor):
        return sox_effects.apply_effects_tensor(
            tensor, self.sample_rate, self.effects, self.channels_first)
Example 15
    def forward(self, wav_tensor: torch.Tensor,
                sample_rate: int) -> torch.Tensor:
        wav_tensor, _ = apply_effects_tensor(
            wav_tensor, sample_rate, self.effects)
        return wav_tensor