def get_train_transforms(config: object,
                         transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = tforms_vision.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                            n_fft=config.n_fft,
                                            win_length=config.hop_length,
                                            hop_length=config.hop_length,
                                            f_min=float(config.fmin),
                                            f_max=float(config.fmax),
                                            pad=0,
                                            n_mels=config.n_mels),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                # tforms_aud.RandomCrop(config.max_length_frames),
                # Raises "Can't call numpy() on Variable that requires grad.
                # Use var.detach().numpy() instead."
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        else:
            raise ValueError('Unsupported transforms set: {}'.format(transforms_set))
    else:
        if transforms_set == TformsSet.TorchAudio:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.Spectrogram(n_fft=config.n_fft,
                                         win_length=config.hop_length,
                                         hop_length=config.hop_length,
                                         pad=0,
                                         power=2,
                                         normalized=True),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        else:
            raise ValueError('Unsupported transforms set: {}'.format(transforms_set))
    return trans
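# --- Usage sketch (added for illustration, not part of the original module) ---
# Builds a throwaway config with the fields read above and runs the torchaudio mel
# pipeline on one second of random noise. All field values here are assumptions
# chosen only to make the example run, not project defaults.
def _demo_get_train_transforms():
    from types import SimpleNamespace
    import torch

    cfg = SimpleNamespace(use_mels=True, resampling_rate=16000, n_fft=2048,
                          hop_length=400, fmin=0, fmax=8000, n_mels=96,
                          max_length_frames=256)
    trans = get_train_transforms(cfg, TformsSet.TorchAudio)
    spec = trans(torch.randn(1, 44100))  # (channels, time) waveform at 44.1 kHz
    print(spec.shape)  # roughly (1, n_mels, time_frames)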
def test_compose(input, idx, axis):
    t = transforms.Compose(
        [transforms.Crop(idx, axis=axis), transforms.Normalize(axis=axis)])
    expected_output = F.crop(input, idx, axis=axis)
    expected_output = F.normalize(expected_output, axis=axis)
    assert np.array_equal(t(input), expected_output)
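# Illustrative direct call of the test above (my addition; the original pytest
# parametrization is not shown here): crop a stereo signal to samples 4..8 along
# the last axis and compare the composed transform with the functional path.
def _demo_test_compose():
    signal = np.random.randn(2, 16).astype('float32')
    test_compose(signal, idx=(4, 8), axis=-1)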
root = '/m/cs/work/falconr1/datasets/mtg-jamendo-dataset-master'
split = 0
start_id = config.start_id * config.chunk_size
stop_id = start_id + config.chunk_size
full_output_path = os.path.join(root, config.output_path)

dataset = JamendoAudioFolder_audtorch(
    root,
    config.subset,
    split,
    config.mode,
    return_fname=True,
    transform=tforms2.Compose([
        tforms2.RandomCrop(config.max_length),
        tforms2.Downmix(1),
        tforms2.Normalize()
    ]),
)

## TODO use the audio dataset, not the npy
# dataset = JamendoAudioFolder_npy(root,
#                                  config.subset,
#                                  split,
#                                  config.mode,
#                                  trim_to_size=config.max_length,
#                                  return_fname=True,
#                                  transform=tforms2.Compose([
#                                      tforms2.Downmix(1),
#                                      tforms2.Normalize()]
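# Hedged sketch (my addition): because RandomCrop(config.max_length) and Downmix(1)
# give every item the same shape, the dataset above can be batched with a plain
# PyTorch DataLoader; batch_size and num_workers are arbitrary example values.
def _make_loader(dset, batch_size=8):
    from torch.utils.data import DataLoader
    return DataLoader(dset, batch_size=batch_size, num_workers=0, shuffle=False)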
        sampling_rate=sampling_rate, mean=mean, stdev=stdev)
    noise, label = next(iter(dataset))
    samples = int(np.ceil(duration * sampling_rate))
    assert noise.shape == (1, samples)
    assert label == 'white noise'
    assert -1 <= np.max(np.abs(noise)) <= 1
    assert len(dataset) == 1


# --- datasets/utils.py ---
crop = transforms.RandomCrop(8192)
resamp1 = transforms.Resample(48000, 44100)
resamp2 = transforms.Resample(44100, 16000)
t1 = transforms.Compose([crop, resamp1])
t2 = transforms.Compose([crop, resamp1, resamp2])
t3 = transforms.Compose([resamp1, crop, resamp2])
d0 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=crop)
d1 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t1)
d2 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t2)
d3 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t3)

df_empty = pd.DataFrame()
df_a = pd.DataFrame(data=[0], columns=['a'])
df_ab = pd.DataFrame(data=[('0', 1)], columns=['a', 'b'])


@pytest.mark.parametrize('list_of_datasets', [
    (d2, d3),
    pytest.param([d0, d1], marks=xfail(raises=ValueError))
])
def test_audioconcatdataset(list_of_datasets):
def get_train_transforms(config: object, set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                                      n_fft=config.n_fft,
                                      win_length=config.hop_length,
                                      hop_length=config.hop_length,
                                      f_min=float(config.fmin),
                                      f_max=float(config.fmax),
                                      pad=0,
                                      n_mels=config.n_mels),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                       padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:  ## no real mel spectrogram in audtorch
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                tforms2.Log(),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    else:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.Spectrogram(n_fft=config.n_fft,
                                   win_length=config.hop_length,
                                   hop_length=config.hop_length,
                                   pad=0,
                                   power=2,
                                   normalized=True),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                #                       n_fft=config.n_fft,
                #                       win_length=config.hop_length,
                #                       hop_length=config.hop_length,
                #                       f_min=float(config.fmin),
                #                       f_max=float(config.fmax),
                #                       pad=0,
                #                       n_mels=config.n_mels),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                       padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    return trans
def compareTforms(config):
    '''
    Compare different transformation sets for spectrograms (torchaudio, audtorch,
    and my own custom spectrogram using librosa), applied to a sample audio file
    from the LibriSpeech dataset.

    This code was written mostly to post as a minimal working example in a
    GitHub issue.
    '''
    config.use_mels = False
    config.win_length = 400
    config.hop_length = 400
    config.n_fft = 2048
    config.resampling_rate = 16000

    # torchaudio: power spectrogram in dB
    augment1 = tforms2.Compose([
        myTforms.ToTensor(),
        tforms.Spectrogram(
            n_fft=2048,
            win_length=400,  # 400 samples @ 16 kHz = 25 ms
            hop_length=400,
            pad=0,
            power=2,
            normalized=False),
        tforms.AmplitudeToDB(stype='power', top_db=80)
    ])

    # audtorch: magnitude spectrogram in dB
    # (10 * log10 of a power spectrogram equals 20 * log10 of the magnitude,
    #  so the two dB scalings should be comparable)
    augment2 = tforms2.Compose([
        tforms2.Spectrogram(
            window_size=400,  # 400 samples @ 16 kHz = 25 ms
            hop_size=400,
            fft_size=2048),
        myTforms.ToTensor(),
        tforms.AmplitudeToDB(stype='magnitude', top_db=80)
    ])

    # custom librosa-based spectrogram
    augment3 = tforms2.Compose([myTforms.Spectrogram(config)])

    data1 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment1)
    data2 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment2)
    data3 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment3)

    plt.figure(figsize=(16, 8))
    titles = ['torchaudio', 'audtorch', 'myset']

    for i, data in enumerate([data1, data2, data3]):
        spec, label = data[0]
        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()
        plt.subplot(1, 3, i + 1)
        plt.imshow(spec.squeeze(), interpolation='nearest', cmap='inferno',
                   origin='lower', aspect='auto')
        plt.colorbar()
        plt.title(titles[i])

    plt.savefig(os.path.join('./results', 'Test_Output_compare_specs.png'))
    plt.show()
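# Hedged usage sketch (my addition): compareTforms overwrites the spectrogram fields
# it needs on the config object itself, so any attribute container works. The mel
# fields below are assumptions in case myTforms.Spectrogram reads them; the exact
# fields that custom transform expects are not shown in this file.
def _run_compare():
    from argparse import Namespace
    cfg = Namespace(fmin=0, fmax=8000, n_mels=96)
    compareTforms(cfg)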