def test_item_noise_not_applied_in_valid(audio):
    add_noise = AddNoise(p=1.0)
    test_aud = AudioTensor(torch.ones_like(audio), 16000)
    train_out = add_noise(test_aud.clone(), split_idx=0)
    val_out = add_noise(test_aud.clone(), split_idx=1)
    _test_ne(test_aud, train_out)
    _test_eq(test_aud, val_out)
def test_crop_time():
    for i in [1, 2, 5]:
        a2s = AudioToSpec.from_cfg(AudioConfig.Voice())
        audio = test_audio_tensor(seconds=3)
        crop = CropTime(i * 1000)
        inp, out = apply_transform(crop, a2s(audio))
        _test_eq(i, round(out.duration))
        _test_close(out.width, int((i / inp.duration) * inp.width), eps=1.01)
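# `apply_transform` is used throughout these tests. For reference, a minimal
# sketch of what it is assumed to do, based purely on how it is called here
# (an illustration, not the helper's actual definition): apply the transform
# to a copy at the train split and return both the untouched input and the
# transformed output.
def _apply_transform_sketch(transform, item):
    inp = item.clone()
    # split_idx=0 selects train-time behavior for fastai-style transforms
    out = transform(item.clone(), split_idx=0)
    return inp, out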
def test_resize_int():
    # Test when size is an int
    size = 224
    resize_int = TfmResize(size)
    audio = test_audio_tensor()
    a2s = AudioToSpec.from_cfg(AudioConfig.Voice())
    sg = a2s(audio)
    inp, out = apply_transform(resize_int, sg)
    _test_eq(out.shape[1:], torch.Size([size, size]))
def test_padding_after_resize(audio):
    "Padding is added to the end but not the beginning"
    new_duration = (audio.duration + 1) * 1000
    cropsig_pad_after = ResizeSignal(new_duration, pad_mode=AudioPadType.Zeros_After)
    inp, out = apply_transform(cropsig_pad_after, audio)
    # test that the end of the signal is padded with zeros
    _test_eq(out[:, -10:], torch.zeros_like(out)[:, -10:])
    # test that the front of the signal is not padded with zeros
    _test_ne(out[:, 0:10], out[:, -10:])
def test_mask_time_gpu():
    c, f, t = 2, 120, 80
    min_size = 5
    max_size = 7
    sg = AudioSpectrogram(torch.rand([c, f, t]))
    val = 10  # Use a value not in the original spectrogram
    gradient_sg = AudioSpectrogram(
        torch.linspace(0, 1, t).view(1, 1, t).repeat([c, f, 1])
    )
    ones = torch.ones_like(sg)

    # Test patching with the mean
    with patch(
        "fastaudio.augment.functional.region_mask",
        side_effect=[
            torch.BoolTensor([[1] * 10 + [0] * (t - 10)]),
        ],
    ):
        mask_with_mean = MaskTimeGPU(min_size=min_size, max_size=max_size, mask_val=None)
        # Use a gradient so we can be sure the mean will never show up outside the mask
        inp, out = apply_transform(mask_with_mean, gradient_sg)
        channelwise_mean = inp[..., :10].mean(dim=(-2, -1)).reshape(-1, 1, 1)
        _test_close(
            out[..., :10],
            channelwise_mean * ones[..., :10],
        )
        assert not (out[..., 10:] == channelwise_mean).any(), out == channelwise_mean

    # Test multiple masks (and patching with a value)
    with patch(
        "fastaudio.augment.functional.region_mask",
        side_effect=[
            torch.BoolTensor([[1] * 10 + [0] * (t - 10), [0] * (t - 10) + [1] * 10]),
        ],
    ):
        mask_with_val = MaskTimeGPU(
            min_size=min_size, num_masks=2, max_size=max_size, mask_val=val
        )
        inp, out = apply_transform(mask_with_val, sg)
        _test_eq(
            out[..., :10],
            val * ones[..., :10],
        )
        _test_eq(
            out[..., t - 10 :],
            val * ones[..., t - 10 :],
        )
        matches = out[..., 10 : t - 10] == val
        assert not matches.any(), matches
def test_crop_time_repeat_padding():
    "Test that repeat padding works when cropping time"
    repeat = 3
    audio = test_audio_tensor()
    crop_with_repeat = CropTime(
        repeat * 1000 * audio.duration, pad_mode=AudioPadType.Repeat
    )
    a2s = AudioToSpec.from_cfg(AudioConfig.Voice())
    sg = a2s(audio)
    inp, out = apply_transform(crop_with_repeat, sg)
    _test_eq(inp.width, sg.width)
    _test_ne(sg.width, out.width)
def test_delta_channels():
    "nchannels for a spectrogram is how many channels its original audio had"
    delta = Delta()
    audio = test_audio_tensor(channels=1)
    a2s = AudioToSpec.from_cfg(AudioConfig.Voice())
    sg = a2s(audio)
    inp, out = apply_transform(delta, sg)
    _test_eq(out.nchannels, inp.nchannels * 3)
    _test_eq(out.shape[1:], inp.shape[1:])
    _test_ne(out[0], out[1])
def test_mask_time():
    # create a random time mask and test that it is being correctly applied
    size, start, val = [random.randint(1, 50) for i in range(3)]
    time_mask_test = MaskTime(size=size, start=start, val=val)
    audio = test_audio_tensor()
    a2s = AudioToSpec.from_cfg(AudioConfig.Voice())
    sg = a2s(audio)
    inp, out = apply_transform(time_mask_test, sg)
    _test_eq(
        out[:, :, start : start + size],
        val * torch.ones_like(inp)[:, :, start : start + size],
    )
def test_delta_channels_gpu():
    "nchannels for a spectrogram is how many channels its original audio had"
    delta = DeltaGPU()
    # Explicitly check more than one channel
    audio = test_audio_tensor(channels=2)
    a2s = AudioToSpec.from_cfg(AudioConfig.Voice())
    sg = a2s(audio)
    inp, out = apply_transform(delta, sg)
    _test_eq(out.nchannels, inp.nchannels * 3)
    _test_eq(out.shape[-2:], inp.shape[-2:])
    for i1, i2 in [(0, 2), (1, 3), (0, 4), (1, 5), (2, 4), (3, 5)]:
        assert not torch.allclose(out[i1], out[i2])
def test_max(self):
    # Test max size
    with patch(
        "torch.rand",
        side_effect=[
            torch.Tensor([[[[1.0]]]]),
            torch.Tensor([[[[0.0]]]]),
        ],
    ):
        _test_eq(
            region_mask(1, 4, 6, 10),
            torch.BoolTensor([[[[1] * 6 + [0] * 4]]]),
        )
def test_resize_signal_repeat(audio):
    """
    Test pad_mode repeat by making sure that columns are equal at the
    appropriate offsets
    """
    dur = audio.duration * 1000
    repeat = 3
    cropsig_repeat = ResizeSignal(dur * repeat, pad_mode=AudioPadType.Repeat)
    inp, out = apply_transform(cropsig_repeat, audio)
    for i in range(repeat):
        s = int(i * inp.nsamples)
        e = int(s + inp.nsamples)
        _test_eq(out[:, s:e], inp)
def test_multiple_masks(self):
    # Test multiple masks
    with patch(
        "torch.rand",
        side_effect=[
            torch.Tensor([[1.0], [0.0]]),
            torch.Tensor([[0.0], [0.5]]),
        ],
    ):
        _test_eq(
            region_mask(2, 4, 6, 10),
            torch.BoolTensor([[1] * 6 + [0] * 4, [0] * 3 + [1] * 4 + [0] * 3]),
        )
def test_min(self):
    # Test min size
    with patch(
        "torch.rand",
        side_effect=[
            torch.Tensor([0.0]),
            # Also test a start that lands in the middle
            torch.Tensor([0.5]),
        ],
    ):
        _test_eq(
            region_mask(1, 4, 6, 10),
            torch.BoolTensor([0] * 3 + [1] * 4 + [0] * 3),
        )
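# The patched `torch.rand` calls above pin the two random draws that
# `region_mask` is assumed to make: one picking each mask's size, one picking
# its start. A minimal sketch of that assumed behavior, consistent with these
# tests but not the library's actual implementation (e.g. the real code may
# round sizes to whole positions):
def _region_mask_sketch(n_masks, min_size, max_size, total_size):
    # First draw: a size in [min_size, max_size] per mask
    sizes = torch.rand(n_masks, 1) * (max_size - min_size) + min_size
    # Second draw: a start in [0, total_size - size] per mask
    starts = torch.rand(n_masks, 1) * (total_size - sizes)
    # Compare a position ramp against each (start, start + size) window
    positions = torch.arange(total_size).unsqueeze(0)
    return (positions >= starts) & (positions < starts + sizes)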
def test_crop_time_with_pipeline(ex_files):
    """
    AudioToSpec->CropTime and ResizeSignal->AudioToSpec will result in
    same size images
    """
    oa = OpenAudio(ex_files)
    crop_dur = random.randint(1000, 5000)
    DBMelSpec = SpectrogramTransformer(mel=True, to_db=True)
    pipe_cropsig = Pipeline([oa, DBMelSpec(hop_length=128), CropTime(crop_dur)])
    pipe_cropspec = Pipeline(
        [
            oa,
            ResizeSignal(crop_dur),
            DBMelSpec(hop_length=128),
        ]
    )
    for i in range(4):
        _test_eq(pipe_cropsig(i).width, pipe_cropspec(i).width)
def test_resample_multi_channel():
    audio = test_audio_tensor(channels=3)
    resampler = Resample(8000)
    _, out = apply_transform(resampler, audio)
    _test_eq(out.nsamples, out.duration * 8000)
    _test_eq(out.nchannels, 3)
    _test_eq(out.sr, 8000)
def test_resizing_signal():
    "Can use the ResizeSignal Transform"
    audio = test_audio_tensor(seconds=10, sr=1000)
    mcaudio = test_audio_tensor(channels=2)
    for i in [1, 2, 5]:
        inp, out = apply_transform(ResizeSignal(i * 1000), audio)
        _test_eq(out.duration, i)
        _test_eq(out.nsamples, out.duration * inp.sr)
        inp, out = apply_transform(ResizeSignal(i * 1000), mcaudio)
        _test_eq(out.duration, i)
def test_cropping():
    "Can use the ResizeSignal Transform"
    audio = test_audio_tensor(seconds=10, sr=1000)
    mcaudio = test_audio_tensor(channels=2)
    for i in [1, 2, 5]:
        inp, out = apply_transform(ResizeSignal(i * 1000), audio.clone())
        _test_eq(out.duration, i)
        _test_eq(out.nsamples, out.duration * inp.sr)
        # Multi channel cropping
        inp, mc = apply_transform(ResizeSignal(i * 1000), mcaudio.clone())
        _test_eq(mc.duration, i)
def test_shift():
    t1 = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    t3 = torch.tensor(
        [
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
            [21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
        ]
    )
    b4 = torch.stack([t3, t3, t3, t3])
    _test_eq(b4.shape, torch.Size([4, 3, 10]))
    _test_eq(_shift(t1, 4), torch.tensor([[0, 0, 0, 0, 1, 2, 3, 4, 5, 6]]))
    _test_eq(
        _shift(t3, -2),
        torch.tensor(
            [
                [3, 4, 5, 6, 7, 8, 9, 10, 0, 0],
                [13, 14, 15, 16, 17, 18, 19, 20, 0, 0],
                [23, 24, 25, 26, 27, 28, 29, 30, 0, 0],
            ]
        ),
    )
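# For reference, the `_shift` contract exercised above and in
# test_shift_with_zero: a positive shift moves samples right along the last
# dim (zero-padding the front), a negative shift moves them left
# (zero-padding the end), and the shape is preserved. A minimal sketch of
# that contract, inferred from these assertions rather than taken from the
# library's implementation:
def _shift_sketch(t, n):
    if n == 0:
        return t.clone()
    out = torch.zeros_like(t)
    if n > 0:
        out[..., n:] = t[..., :-n]  # shift right, front padded with zeros
    else:
        out[..., :n] = t[..., -n:]  # shift left, end padded with zeros
    return out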
def test_shape(self):
    _test_eq(region_mask(1, 5, 7, 10).shape, (1, 10))
    _test_eq(region_mask(2, 3, 7, 12).shape, (2, 12))
    _test_eq(region_mask(4, 0, 3, 3).shape, (4, 3))
def test_resample(audio):
    no_resample_needed = Resample(audio.sr)
    inp, out = apply_transform(no_resample_needed, audio)
    assert inp.sr == out.sr
    _test_eq(inp.data, out.data)
def test_down_mix_mono(audio):
    "Test downmixing 1 channel has no effect"
    downmixer = DownmixMono()
    inp, out = apply_transform(downmixer, audio)
    _test_eq(inp.data, out.data)
def test_no_rolling(audio):
    shift_no_roll = SignalShifter(p=1, max_pct=0.5, roll=False)
    inp, out = apply_transform(shift_no_roll, audio)
    _test_eq(inp.data.shape, out.data.shape)
def test_shift_max_time(audio):
    shift = SignalShifter(max_time=1)
    inp, out = apply_transform(shift, audio)
    _test_eq(inp.data.shape, out.data.shape)
def test_shift_with_zero():
    _test_eq(_shift(torch.arange(1, 10), 0), torch.arange(1, 10))
def test_padding_both_side_resize(audio):
    "Make sure padding is added on both sides"
    new_duration = (audio.duration + 1) * 1000
    cropsig_pad_both = ResizeSignal(new_duration)
    inp, out = apply_transform(cropsig_pad_both, audio)
    _test_eq(out[:, 0:2], out[:, -2:])
def test_resize_same_duration(audio):
    "Resizing to the audio's own duration should return it unchanged"
    resize = ResizeSignal(audio.duration * 1000)
    inp, out = apply_transform(resize, audio)
    _test_eq(inp, out)