def test_mask_time():
    """MaskTimeGPU: mean-filling masks only the masked region, and multiple
    value-filled masks each land exactly where `region_mask` says."""
    # channels, freq bins, time steps of the synthetic spectrogram
    c, f, t = 2, 120, 80
    min_size = 5
    max_size = 7
    sg = AudioSpectrogram(torch.rand([c, f, t]))
    val = 10  # Use a value not in the original spectrogram
    # Monotonic 0..1 ramp along time, identical in every channel/freq bin.
    gradient_sg = AudioSpectrogram(
        torch.linspace(0, 1, t).view(1, 1, t).repeat([c, f, 1]))
    ones = torch.ones_like(sg)
    # Test patching with mean
    with patch(
        "fastaudio.augment.functional.region_mask",
        side_effect=[
            # Deterministic mask: first 10 time steps masked, rest untouched.
            torch.BoolTensor([[1] * 10 + [0] * (t - 10)]),
        ],
    ):
        # mask_val=None means "fill the mask with the channelwise mean"
        mask_with_mean = MaskTimeGPU(min_size=min_size, max_size=max_size, mask_val=None)
        # Use a gradient so we can be sure the mean will never show up outside the mask
        inp, out = apply_transform(mask_with_mean, gradient_sg)
        channelwise_mean = inp[..., :10].mean(dim=(-2, -1)).reshape(-1, 1, 1)
        _test_close(
            out[..., :10],
            channelwise_mean * ones[..., :10],
        )
        # Nothing outside the first 10 steps may equal the fill value.
        assert not (out[..., 10:] == channelwise_mean).any(), out == channelwise_mean
    # Test multiple masks (and patching with value)
    with patch(
        "fastaudio.augment.functional.region_mask",
        side_effect=[
            # Two masks: first 10 steps and last 10 steps.
            torch.BoolTensor([[1] * 10 + [0] * (t - 10), [0] * (t - 10) + [1] * 10]),
        ],
    ):
        mask_with_val = MaskTimeGPU(min_size=min_size, num_masks=2, max_size=max_size, mask_val=val)
        inp, out = apply_transform(mask_with_val, sg)
        _test_eq(
            out[..., :10],
            val * ones[..., :10],
        )
        _test_eq(
            out[..., t - 10:],
            val * ones[..., t - 10:],
        )
        # The middle of the spectrogram must be untouched by either mask.
        matches = out[..., 10:t - 10] == val
        assert not matches.any(), matches
def test_resizing_signal():
    "Can use the ResizeSignal Transform"
    mono = test_audio_tensor(seconds=10, sr=1000)
    multi = test_audio_tensor(channels=2)
    for secs in (1, 2, 5):
        # Resize the mono signal and check duration and sample count agree.
        inp, out = apply_transform(ResizeSignal(secs * 1000), mono)
        _test_eq(out.duration, secs)
        _test_eq(out.nsamples, out.duration * inp.sr)
        # The same resize must also work on multi-channel audio.
        _, mc_out = apply_transform(ResizeSignal(secs * 1000), multi)
        _test_eq(mc_out.duration, secs)
def test_crop_time_after_padding():
    """CropTime with Zeros_After padding stretches a spectrogram past its
    original duration."""
    sg_orig = test_audio_tensor()
    a2s = AudioToSpec.from_cfg(AudioConfig.Voice())
    sg = a2s(sg_orig)
    # Ask for 5 seconds more than the spectrogram currently holds.
    crop_time = CropTime((sg.duration + 5) * 1000, pad_mode=AudioPadType.Zeros_After)
    inp, out = apply_transform(crop_time, sg.clone())
    # Bug fix: the original asserted `sg.duration != sg_orig.duration`, which
    # never looked at the transform's output (`out` was unused). Verify the
    # padded output actually differs in duration from the input spectrogram.
    _test_ne(out.duration, sg.duration)
def test_resample_rates(audio):
    "Test and hear realistic sample rates"
    for target_sr in (2000, 4000, 8000, 22050, 44100):
        inp, out = apply_transform(Resample(target_sr), audio)
        # The output must carry the requested rate and the matching
        # number of samples for its (unchanged) duration.
        assert out.sr == target_sr
        assert out.nsamples == inp.duration * target_sr
def test_resample_multi_channel(audio):
    """Resampling preserves the channel count of a multi-channel signal."""
    # NOTE(review): the `audio` fixture argument is shadowed immediately —
    # presumably intentional, to get a 3-channel signal; confirm the fixture
    # parameter is still wanted.
    audio = test_audio_tensor(channels=3)
    _, resampled = apply_transform(Resample(8000), audio)
    _test_eq(resampled.nsamples, resampled.duration * 8000)
    _test_eq(resampled.nchannels, 3)
    _test_eq(resampled.sr, 8000)
def test_crop_time():
    """Cropping a spectrogram to N seconds yields an N-second result with a
    proportionally narrower width."""
    for secs in (1, 2, 5):
        to_spec = AudioToSpec.from_cfg(AudioConfig.Voice())
        sg = to_spec(test_audio_tensor(seconds=3))
        inp, out = apply_transform(CropTime(secs * 1000), sg)
        _test_eq(round(out.duration), secs)
        # Width shrinks/grows in the same ratio as the duration.
        _test_close(out.width, int((secs / inp.duration) * inp.width), eps=1.01)
def test_resize_int():
    """TfmResize given a single int produces a square spectrogram."""
    target = 224
    sg = AudioToSpec.from_cfg(AudioConfig.Voice())(test_audio_tensor())
    _, out = apply_transform(TfmResize(target), sg)
    _test_eq(out.shape[1:], torch.Size([target, target]))
def test_padding_after_resize(audio):
    "Padding is added to the end but not the beginning"
    stretched_ms = (audio.duration + 1) * 1000
    resize_pad_after = ResizeSignal(stretched_ms, pad_mode=AudioPadType.Zeros_After)
    inp, out = apply_transform(resize_pad_after, audio)
    tail = out[:, -10:]
    # Tail must be pure zero padding...
    _test_eq(tail, torch.zeros_like(out)[:, -10:])
    # ...while the head still carries real (non-zero) signal.
    _test_ne(out[:, 0:10], tail)
def test_crop_time_repeat_padding():
    "Test that repeat padding works when cropping time"
    n_repeats = 3
    audio = test_audio_tensor()
    # Target duration is n_repeats times the original, filled by repeating.
    crop_with_repeat = CropTime(
        n_repeats * 1000 * audio.duration, pad_mode=AudioPadType.Repeat
    )
    sg = AudioToSpec.from_cfg(AudioConfig.Voice())(audio)
    inp, out = apply_transform(crop_with_repeat, sg)
    _test_eq(inp.width, sg.width)
    _test_ne(sg.width, out.width)
def test_delta_channels():
    " nchannels for a spectrogram is how many channels its original audio had "
    mono = test_audio_tensor(channels=1)
    sg = AudioToSpec.from_cfg(AudioConfig.Voice())(mono)
    inp, out = apply_transform(Delta(), sg)
    # Delta stacks original + delta + delta-delta, tripling the channels
    # while leaving the per-channel shape alone.
    _test_eq(out.nchannels, inp.nchannels * 3)
    _test_eq(out.shape[1:], inp.shape[1:])
    # The derived channels must differ from the original.
    _test_ne(out[0], out[1])
def test_signal_cutout():
    """SignalCutoutGPU zeroes between min_cut_pct and max_cut_pct of samples."""
    n_channels, n_samples = 2, 16000
    lo_pct, hi_pct = 0.10, 0.15
    # Shift the random signal into [0.1, 1.0] so any zero we find was cut out.
    audio = AudioTensor(torch.rand([n_channels, n_samples]), sr=16000) * 0.9 + 0.1
    cutout = SignalCutoutGPU(p=1.0, min_cut_pct=lo_pct, max_cut_pct=hi_pct)
    inp, out = apply_transform(cutout, audio)
    _test_ne(inp.data, out.data)
    zero_count = (out == 0).sum()
    assert (
        lo_pct * n_samples * n_channels <= zero_count <= hi_pct * n_samples * n_channels
    ), zero_count
def test_mask_freq():
    # NOTE(review): despite the name this exercises MaskTime (a time mask),
    # matching the original body — confirm whether MaskFreq was intended.
    # Create a random time mask and test that it is being correctly applied.
    size, start, val = (random.randint(1, 50) for _ in range(3))
    masker = MaskTime(size=size, start=start, val=val)
    sg = AudioToSpec.from_cfg(AudioConfig.Voice())(test_audio_tensor())
    inp, out = apply_transform(masker, sg)
    expected = val * torch.ones_like(inp)[:, :, start:start + size]
    _test_eq(out[:, :, start:start + size], expected)
def test_resize_signal_repeat(audio):
    """
    Test pad_mode repeat by making sure that columns are equal
    at the appropriate offsets
    """
    n_copies = 3
    orig_ms = audio.duration * 1000
    resize_repeat = ResizeSignal(orig_ms * n_copies, pad_mode=AudioPadType.Repeat)
    inp, out = apply_transform(resize_repeat, audio)
    # Each repeated slice must reproduce the input exactly.
    for copy_idx in range(n_copies):
        lo = int(copy_idx * inp.nsamples)
        hi = int(lo + inp.nsamples)
        _test_eq(out[:, lo:hi], inp)
def test_delta_channels():
    " nchannels for a spectrogram is how many channels its original audio had "
    # Explicitly check more than one channel
    sg = AudioToSpec.from_cfg(AudioConfig.Voice())(test_audio_tensor(channels=2))
    inp, out = apply_transform(DeltaGPU(), sg)
    # DeltaGPU triples the channel count; freq/time dims are unchanged.
    _test_eq(out.nchannels, inp.nchannels * 3)
    _test_eq(out.shape[-2:], inp.shape[-2:])
    # Original, delta and delta-delta channels must all be distinct.
    for left, right in [(0, 2), (1, 3), (0, 4), (1, 5), (2, 4), (3, 5)]:
        assert not torch.allclose(out[left], out[right])
def test_cropping():
    "Can use the ResizeSignal Transform"
    audio = test_audio_tensor(seconds=10, sr=1000)
    # Bug fix: the "Multi Channel Cropping" section previously cloned the same
    # mono `audio`, so multi-channel cropping was never actually exercised.
    mcaudio = test_audio_tensor(seconds=10, sr=1000, channels=2)
    for secs in (1, 2, 5):
        inp, out = apply_transform(ResizeSignal(secs * 1000), audio.clone())
        _test_eq(out.duration, secs)
        _test_eq(out.nsamples, out.duration * inp.sr)
        # Multi Channel Cropping
        _, mc_out = apply_transform(ResizeSignal(secs * 1000), mcaudio.clone())
        _test_eq(mc_out.duration, secs)
def test_noise_non_white(audio):
    """Adding pink noise changes the signal."""
    pink_noise = AddNoise(color=NoiseColor.Pink)
    inp, out = apply_transform(pink_noise, audio)
    _test_ne(inp.data, out.data)
def test_resample(audio):
    """Resampling to the signal's own rate is a no-op."""
    identity_resample = Resample(audio.sr)
    inp, out = apply_transform(identity_resample, audio)
    assert out.sr == inp.sr
    _test_eq(inp.data, out.data)
def test_silence_removed(audio):
    "Add silence to a signal and test that it gets removed"
    remover = RemoveSilence(threshold=20, pad_ms=20)
    before, after = apply_transform(remover, audio)
    # Removing silence can only shorten (or keep) the signal.
    assert after.nsamples <= before.nsamples
def test_signal_cutout(audio):
    """SignalCutout with p=1 always alters the signal."""
    always_cut = SignalCutout(1)
    inp, out = apply_transform(always_cut, audio)
    _test_ne(inp.data, out.data)
def test_signal_loss(audio):
    """SignalLossGPU with p=1 always alters the signal."""
    always_lose = SignalLossGPU(1)
    inp, out = apply_transform(always_lose, audio)
    _test_ne(inp.data, out.data)
def test_change_volume(audio):
    """ChangeVolumeGPU with p=1 always alters the signal."""
    always_change = ChangeVolumeGPU(1)
    inp, out = apply_transform(always_change, audio)
    _test_ne(inp.data, out.data)
def test_padding_both_side_resize(audio):
    "Make sure they are padding on both sides"
    target_ms = (audio.duration + 1) * 1000
    # Default pad_mode pads symmetrically, so both ends should match (zeros).
    inp, out = apply_transform(ResizeSignal(target_ms), audio)
    _test_eq(out[:, 0:2], out[:, -2:])
def test_noise_white(audio):
    """AddNoiseGPU with white noise and p=1 always alters the signal."""
    white_noise = AddNoiseGPU(
        color=NoiseColor.White, p=1.0, min_level=0.1, max_level=0.2
    )
    inp, out = apply_transform(white_noise, audio)
    _test_ne(inp.data, out.data)
def test_down_mix_mono(audio):
    "Test downmixing 1 channel has no effect"
    mixer = DownmixMono()
    inp, out = apply_transform(mixer, audio)
    _test_eq(inp.data, out.data)
def test_no_rolling(audio):
    """Shifting without roll keeps the signal's shape."""
    shift_no_roll = SignalShifter(p=1, max_pct=0.5, roll=False)
    inp, out = apply_transform(shift_no_roll, audio)
    _test_eq(inp.data.shape, out.data.shape)
def test_shift_max_time(audio):
    """Shifting by max_time keeps the signal's shape."""
    shifter = SignalShifter(max_time=1)
    inp, out = apply_transform(shifter, audio)
    _test_eq(inp.data.shape, out.data.shape)
def test_signal_shift_on_sg():
    """SignalShifter also applies to spectrograms, altering them."""
    sg = AudioToSpec.from_cfg(AudioConfig.BasicSpectrogram())(test_audio_tensor())
    inp, out = apply_transform(SignalShifter(1, 1), sg)
    _test_ne(inp, out)
def test_sg_roll():
    """SGRoll alters a spectrogram."""
    sg = AudioToSpec.from_cfg(AudioConfig.BasicSpectrogram())(test_audio_tensor())
    inp, out = apply_transform(SGRoll(), sg)
    _test_ne(inp, out)
def test_resize_same_duration(audio):
    "Asking to resize to the duration should return the audio back"
    noop_resize = ResizeSignal(audio.duration * 1000)
    inp, out = apply_transform(noop_resize, audio)
    _test_eq(inp, out)
def test_noise_non_white(audio):
    # White noise uses a different method to other noises, so test both.
    pink_noise = AddNoiseGPU(
        color=NoiseColor.Pink, p=1.0, min_level=0.1, max_level=0.2
    )
    inp, out = apply_transform(pink_noise, audio)
    _test_ne(inp.data, out.data)