Exemple #1
0
def benchmark_resample(
    method,
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=DEFAULT_LOWPASS_FILTER_WIDTH,
    rolloff=DEFAULT_ROLLOFF,
    resampling_method=DEFAULT_RESAMPLING_METHOD,
    beta=None,
    librosa_type=None,
    iters=5
):
  """Return the mean wall-clock time, in seconds, of one resampling call.

  Args:
    method: Backend to time: ``"functional"`` (``F.resample``),
      ``"transforms"`` (``T.Resample``) or ``"librosa"``
      (``librosa.resample``).
    waveform: Tensor to resample. For ``"librosa"`` it is squeezed and
      converted to a NumPy array before the timed region starts.
    sample_rate: Source sampling rate.
    resample_rate: Target sampling rate.
    lowpass_filter_width: Forwarded to the two torchaudio backends.
    rolloff: Forwarded to the two torchaudio backends.
    resampling_method: Forwarded to the two torchaudio backends.
    beta: Forwarded to the two torchaudio backends.
    librosa_type: Passed to ``librosa.resample`` as ``res_type``.
    iters: Number of timed repetitions to average over.

  Returns:
    Elapsed time divided by ``iters``, or ``None`` when ``method`` is not
    one of the three supported names.
  """
  if method == "functional":
    def run_once():
      # ``beta`` is forwarded so that a caller-supplied value is actually
      # part of what gets timed.
      F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width,
                 rolloff=rolloff, resampling_method=resampling_method, beta=beta)
  elif method == "transforms":
    # Building the transform is setup work and stays outside the timed region.
    resampler = T.Resample(sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width,
                           rolloff=rolloff, resampling_method=resampling_method, beta=beta,
                           dtype=waveform.dtype)

    def run_once():
      resampler(waveform)
  elif method == "librosa":
    # The tensor -> NumPy conversion is setup work as well.
    waveform_np = waveform.squeeze().numpy()

    def run_once():
      # Keyword arguments: recent librosa releases no longer accept the
      # sampling rates positionally.
      librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate,
                       res_type=librosa_type)
  else:
    return None

  # perf_counter is monotonic and high resolution, unlike time.time().
  begin = time.perf_counter()
  for _ in range(iters):
    run_once()
  elapsed = time.perf_counter() - begin
  return elapsed / iters
Exemple #2
0
    def test_resample_no_warning(self):
        """resample should emit no warning when float frequencies hold integer values"""
        rate = 44100
        noise = get_whitenoise(sample_rate=rate, duration=0.1)
        orig_freq = float(rate)
        new_freq = rate / 2.

        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, including ones already shown once.
            warnings.simplefilter("always")
            F.resample(noise, orig_freq, new_freq)
        assert len(caught) == 0
Exemple #3
0
    def test_resample_warning(self):
        """resample should emit exactly one warning when a frequency has a fractional part"""
        rate = 44100
        noise = get_whitenoise(sample_rate=rate, duration=0.1)
        fractional_freq = 5512.5

        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, including ones already shown once.
            warnings.simplefilter("always")
            F.resample(noise, rate, fractional_freq)
        assert len(caught) == 1
 def func_beta(tensor):
     """Resample ``tensor`` from rate 16000 to 8000 with a Kaiser window and ``beta=6``."""
     orig_freq = 16000.
     new_freq = 8000.
     return F.resample(
         tensor, orig_freq, new_freq, resampling_method="kaiser_window", beta=6.)
    def test_resample(self):
        """Compare ``F.resample`` with ``librosa.resample`` at twice and half the source rate."""
        input_path = common_utils.get_asset_path('sinewave.wav')
        waveform, sample_rate = common_utils.load_wav(input_path)
        # librosa receives a NumPy array with dimension 0 squeezed out.
        waveform_np = waveform.squeeze(0).numpy()

        # Upsampled rate first, then downsampled rate.
        for target_rate in (sample_rate * 2, sample_rate // 2):
            ta_resampled = F.resample(waveform, sample_rate, target_rate)
            # Keyword arguments: recent librosa releases no longer accept the
            # sampling rates positionally.
            lr_resampled = librosa.resample(
                waveform_np, orig_sr=sample_rate, target_sr=target_rate)
            # Put dimension 0 back so both results have the same shape.
            lr_resampled = torch.from_numpy(lr_resampled).unsqueeze(0)

            self.assertEqual(ta_resampled, lr_resampled, atol=1e-2, rtol=1e-5)
Exemple #6
0
 def test_resample_waveform_identity_size(self, resampling_method):
     """Resampling to the same rate must keep the length of the last dimension."""
     rate = 16000
     noise = get_whitenoise(sample_rate=rate, duration=0.5)
     same_rate = F.resample(noise, rate, rate, resampling_method=resampling_method)
     assert same_rate.size(-1) == noise.size(-1)
Exemple #7
0
 def test_resample_waveform_downsample_size(self, resampling_method):
     """Resampling to half the rate must halve the length of the last dimension."""
     rate = 16000
     half_rate = rate // 2
     noise = get_whitenoise(sample_rate=rate, duration=0.5)
     halved = F.resample(noise, rate, half_rate, resampling_method=resampling_method)
     assert halved.size(-1) == noise.size(-1) // 2
Exemple #8
0
    def forward(self, waveform: Tensor) -> Tensor:
        r"""Resample ``waveform`` from ``self.orig_freq`` to ``self.new_freq``.

        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Output signal of dimension (..., time).

        Raises:
            ValueError: If ``self.resampling_method`` is not
                ``'sinc_interpolation'``.
        """
        method = self.resampling_method
        # Reject unsupported methods up front; only one method is handled here.
        if method != 'sinc_interpolation':
            raise ValueError('Invalid resampling method: {}'.format(method))

        return F.resample(waveform, self.orig_freq, self.new_freq)
Exemple #9
0
    def _test_resample_waveform_accuracy(
            self,
            up_scale_factor=None,
            down_scale_factor=None,
            resampling_method="sinc_interpolation",
            atol=1e-1,
            rtol=1e-4):
        """Resample a cosine and compare it with the same cosine sampled at the new rate.

        The source rate is 1000. The target rate is that value multiplied by
        ``up_scale_factor`` and floor-divided by ``down_scale_factor``, each
        applied only when it is not ``None``.
        """
        edge = 20  # samples discarded at each end before comparing
        orig_rate = 1000
        seconds = 5
        amplitude, cycles_per_second = 123, 3

        target_rate = orig_rate
        if up_scale_factor is not None:
            target_rate *= up_scale_factor
        if down_scale_factor is not None:
            target_rate //= down_scale_factor

        def tone(timestamps):
            # Closed-form signal, evaluated at arbitrary timestamps.
            return amplitude * torch.cos(2 * math.pi * cycles_per_second * timestamps)

        source_times = torch.arange(0, seconds, 1.0 / orig_rate)
        estimate = F.resample(
            tone(source_times).unsqueeze(0),
            orig_rate,
            target_rate,
            resampling_method=resampling_method,
        ).squeeze()

        # Evaluate the reference on exactly as many timestamps as were produced.
        target_times = torch.arange(0, seconds, 1.0 / target_rate)
        expected = tone(target_times[:estimate.size(0)])

        # The first/last samples suffer from boundary effects, so drop them.
        interior = slice(edge, -edge)
        self.assertEqual(
            estimate[..., interior], expected[..., interior], atol=atol, rtol=rtol)
 def func(tensor):
     """Resample ``tensor`` from rate 16000 to 8000 with the ``kaiser_window`` method."""
     orig_freq = 16000.
     new_freq = 8000.
     return F.resample(
         tensor, orig_freq, new_freq, resampling_method="kaiser_window")
 def func(tensor):
     """Resample ``tensor`` from rate 16000 to 8000 with the ``sinc_interpolation`` method."""
     orig_freq = 16000.
     new_freq = 8000.
     return F.resample(
         tensor, orig_freq, new_freq, resampling_method="sinc_interpolation")
Exemple #12
0
    def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
        """Normalize an audio input and yield feature dicts for the model.

        Args:
            inputs: One of

                - ``str``: a file path; the file is read as bytes.
                - ``bytes``: decoded with ``ffmpeg_read`` at the feature
                  extractor's sampling rate.
                - ``dict``: must contain ``"raw"`` and ``"sampling_rate"``;
                  may contain ``"stride"`` as a ``(left, right)`` pair. Any
                  other keys are copied into the yielded dict. The audio is
                  resampled when its rate differs from the feature
                  extractor's. The caller's dict is not modified.
                - ``np.ndarray``: used as is.

                After normalization the audio must be a 1-D ``np.ndarray``.
            chunk_length_s: Chunk length in seconds. A falsy value disables
                chunking.
            stride_length_s: Stride in seconds, either one number used for
                both sides or a ``[left, right]`` pair. Defaults to
                ``chunk_length_s / 6``.

        Yields:
            dict: With chunking, whatever ``chunk_iter`` produces. Without
            chunking, one dict holding ``"is_last": True``, the feature
            extractor output, the extra keys of a dict input and, when a
            stride was given, ``"stride"`` as ``(length, left, right)``.

        Raises:
            ValueError: If the stride exceeds the input length, the audio is
                not a 1-D ``np.ndarray``, chunking is requested for a non-CTC
                model, the chunk is shorter than both strides combined, or a
                stride is given for a speech seq2seq model.
        """
        if isinstance(inputs, str):
            with open(inputs, "rb") as f:
                inputs = f.read()

        if isinstance(inputs, bytes):
            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)

        stride = None
        extra = {}
        if isinstance(inputs, dict):
            # Pop from a shallow copy so the caller's dictionary keeps its keys.
            inputs = inputs.copy()
            stride = inputs.pop("stride", None)
            _inputs = inputs.pop("raw")
            in_sampling_rate = inputs.pop("sampling_rate")
            # Whatever is left is passed through untouched to the output.
            extra = inputs
            inputs = _inputs
            if in_sampling_rate != self.feature_extractor.sampling_rate:
                import torch
                from torchaudio import functional as F

                inputs = F.resample(
                    torch.from_numpy(inputs), in_sampling_rate,
                    self.feature_extractor.sampling_rate).numpy()
                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
            else:
                ratio = 1
            if stride is not None:
                if stride[0] + stride[1] > inputs.shape[0]:
                    raise ValueError("Stride is too large for input")

                # Stride needs to get the chunk length here, it's going to get
                # swallowed by the `feature_extractor` later, and then batching
                # can add extra data in the inputs, so we need to keep track
                # of the original length in the stride so we can cut properly.
                stride = (inputs.shape[0], int(round(stride[0] * ratio)),
                          int(round(stride[1] * ratio)))
        if not isinstance(inputs, np.ndarray):
            raise ValueError(
                f"We expect a numpy ndarray as input, got `{type(inputs)}`")
        if len(inputs.shape) != 1:
            raise ValueError(
                "We expect a single channel audio input for AutomaticSpeechRecognitionPipeline"
            )

        if chunk_length_s:
            # Validate the model type before reading the CTC-only config
            # attribute below, so non-CTC models get this explicit error.
            if self.type not in {"ctc", "ctc_with_lm"}:
                raise ValueError(
                    "`chunk_length_s` is only valid for CTC models, use other chunking options for other models"
                )

            if stride_length_s is None:
                stride_length_s = chunk_length_s / 6

            if isinstance(stride_length_s, (int, float)):
                stride_length_s = [stride_length_s, stride_length_s]

            # Lengths are converted from seconds to samples and rounded to a
            # multiple of the model's inputs-to-logits ratio.
            align_to = self.model.config.inputs_to_logits_ratio
            chunk_len = int(
                round(chunk_length_s * self.feature_extractor.sampling_rate /
                      align_to)) * align_to
            stride_left = int(
                round(stride_length_s[0] * self.feature_extractor.sampling_rate
                      / align_to)) * align_to
            stride_right = int(
                round(stride_length_s[1] * self.feature_extractor.sampling_rate
                      / align_to)) * align_to

            if chunk_len < stride_left + stride_right:
                raise ValueError(
                    "Chunk length must be superior to stride length")

            yield from chunk_iter(inputs, self.feature_extractor, chunk_len,
                                  stride_left, stride_right)
        else:
            processed = self.feature_extractor(
                inputs,
                sampling_rate=self.feature_extractor.sampling_rate,
                return_tensors="pt")
            if stride is not None:
                if self.model.__class__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.values(
                ):
                    raise ValueError(
                        "Stride is only usable with CTC models, try removing it"
                    )

                processed["stride"] = stride
            yield {"is_last": True, **processed, **extra}
Exemple #13
0
    def test_resample_identity(self, resampling_method, sample_rate):
        """Resampling to the same rate should return the waveform unchanged."""
        waveform = get_whitenoise(sample_rate=sample_rate, duration=1)

        # Forward the parametrized method; without it every parametrized case
        # exercised only the default resampling method.
        resampled = F.resample(
            waveform, sample_rate, sample_rate, resampling_method=resampling_method)
        self.assertEqual(waveform, resampled)
Exemple #14
0
#
# Because the filter used for interpolation extends infinitely, the
# ``lowpass_filter_width`` parameter controls the width of the window
# applied to it. It is also referred to as the number of zero crossings,
# since the interpolation passes through zero at every time unit. Using a
# larger ``lowpass_filter_width`` provides a sharper, more precise filter,
# but is more computationally expensive.
#


# Resample the same waveform twice, changing only ``lowpass_filter_width``,
# and plot each result so the two settings can be compared.
sample_rate = 48000
resample_rate = 32000

# Small filter width.
resampled_waveform = F.resample(
    waveform, sample_rate, resample_rate, lowpass_filter_width=6
)
plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=6")

# Large filter width; ``resampled_waveform`` is overwritten with this result.
resampled_waveform = F.resample(
    waveform, sample_rate, resample_rate, lowpass_filter_width=128
)
plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=128")


######################################################################
# Rolloff
# ~~~~~~~
#
# The ``rolloff`` parameter is represented as a fraction of the Nyquist
# frequency, which is the maximal frequency representable by a given