def get_train_transforms(config: object,
                         transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = tforms_vision.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                            n_fft=config.n_fft,
                                            win_length=config.hop_length,
                                            hop_length=config.hop_length,
                                            f_min=float(config.fmin),
                                            f_max=float(config.fmax),
                                            pad=0,
                                            n_mels=config.n_mels),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                # tforms_aud.RandomCrop(config.max_length_frames),  # Raises "Can't call numpy()
                # on Variable that requires grad. Use var.detach().numpy() instead."
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        else:
            # Without this, the default TformsSet.Audtorch would fall through and
            # `trans` would be unbound at the return below.
            raise ValueError('Unsupported transforms set: {}'.format(transforms_set))
    else:
        if transforms_set == TformsSet.TorchAudio:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.Spectrogram(n_fft=config.n_fft,
                                         win_length=config.hop_length,
                                         hop_length=config.hop_length,
                                         pad=0,
                                         power=2,
                                         normalized=True),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        else:
            raise ValueError('Unsupported transforms set: {}'.format(transforms_set))
    return trans
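
# A minimal workaround sketch for the RandomCrop failure noted above: audtorch
# transforms convert to numpy internally, which fails on tensors that still
# require grad. Detaching first avoids the error. `Detach` is a hypothetical
# helper for illustration, not part of torchaudio or audtorch.
class Detach(object):
    """Detach a tensor from the autograd graph so numpy-based transforms work."""

    def __call__(self, tensor):
        return tensor.detach()

# Usage sketch (assumed): place Detach() right before the crop, e.g.
#   tforms_vision.Compose([...,
#                          tforms_torch.AmplitudeToDB(stype='power', top_db=80),
#                          Detach(),
#                          tforms_aud.RandomCrop(config.max_length_frames)])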
elif config.platform == 3:
    root = '/m/cs/work/falconr1/datasets/mtg-jamendo-dataset-master'

split = 0
start_id = config.start_id * config.chunk_size
stop_id = start_id + config.chunk_size
full_output_path = os.path.join(root, config.output_path)

dataset = JamendoAudioFolder_audtorch(root,
                                      config.subset,
                                      split,
                                      config.mode,
                                      return_fname=True,
                                      transform=tforms2.Compose([
                                          tforms2.RandomCrop(config.max_length),
                                          tforms2.Downmix(1),
                                          tforms2.Normalize()
                                      ]),
                                      )

## TODO use the audio dataset, not the npy
# dataset = JamendoAudioFolder_npy(root,
#                                  config.subset,
#                                  split,
#                                  config.mode,
#                                  trim_to_size=config.max_length,
#                                  return_fname=True,
#                                  transform=tforms2.Compose([
#                                      tforms2.Downmix(1),
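
# Hedged usage sketch: iterate only the [start_id, stop_id) chunk computed
# above, e.g. when several preprocessing jobs split the dataset between them.
# Subset and DataLoader are standard torch.utils.data APIs; the
# (audio, target, fname) unpacking assumes return_fname=True appends the
# filename to each sample, as elsewhere in this repo.
from torch.utils.data import DataLoader, Subset

indices = list(range(start_id, min(stop_id, len(dataset))))
loader = DataLoader(Subset(dataset, indices), batch_size=1, num_workers=0)
for audio, target, fname in loader:
    pass  # e.g. compute features here and write them under full_output_path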
@pytest.mark.parametrize('stdev', [1, 0.5])
def test_whitenoise(duration, sampling_rate, mean, stdev):
    dataset = datasets.WhiteNoise(duration=duration,
                                  sampling_rate=sampling_rate,
                                  mean=mean,
                                  stdev=stdev)
    noise, label = next(iter(dataset))
    samples = int(np.ceil(duration * sampling_rate))
    assert noise.shape == (1, samples)
    assert label == 'white noise'
    assert -1 <= np.max(np.abs(noise)) <= 1
    assert len(dataset) == 1


# --- datasets/utils.py ---

# Transform fixtures: a fixed-size crop and two resampling stages, composed
# in different orders.
crop = transforms.RandomCrop(8192)
resamp1 = transforms.Resample(48000, 44100)
resamp2 = transforms.Resample(44100, 16000)
t1 = transforms.Compose([crop, resamp1])
t2 = transforms.Compose([crop, resamp1, resamp2])
t3 = transforms.Compose([resamp1, crop, resamp2])
# 0.5 s of white noise at 48 kHz (24000 samples) under each transform chain.
d0 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=crop)
d1 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t1)
d2 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t2)
d3 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t3)

# Minimal DataFrame fixtures for the utils tests below.
df_empty = pd.DataFrame()
df_a = pd.DataFrame(data=[0], columns=['a'])
df_ab = pd.DataFrame(data=[('0', 1)], columns=['a', 'b'])


@pytest.mark.parametrize('list_of_datasets', [
def test_randomcrop(input, size, axis):
    t = transforms.RandomCrop(size, axis=axis)
    t.fix_randomization = True
    assert np.array_equal(t(input), t(input))
    assert np.array_equal(t(input), F.crop(input, t.idx, axis=t.axis))
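
# Counterpart sketch to test_randomcrop, not in the original suite: without
# fix_randomization the crop offset is redrawn on every call, so two crops of
# the same signal generally differ. Assumes audtorch's transforms module as
# imported in these tests; only the shapes are asserted, since equal crops
# remain possible by chance.
def test_randomcrop_not_fixed():
    signal = np.random.randn(1, 16000)
    t = transforms.RandomCrop(8192)
    out1 = t(signal)  # draws a fresh random offset
    out2 = t(signal)  # draws another offset; equality would be coincidental
    assert out1.shape == (1, 8192)
    assert out2.shape == (1, 8192)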
def get_DataLoader(config):
    train_transforms = get_Transforms()

    if config.platform == 0:
        root = '/Volumes/scratch/work/falconr1/datasets/mtg-jamendo-dataset-master'
    elif config.platform == 2:
        root = '/scratch/work/falconr1/datasets/mtg-jamendo-dataset-master'
    elif config.platform == 3:
        root = '/m/cs/work/falconr1/datasets/mtg-jamendo-dataset-master'

    subset = config.subset
    split = 0
    mode = 'train'

    if config.dataset == 'JamendoSpecFolder':
        dataset = JamendoSpecFolder(root, subset, split, mode,
                                    spec_folder='data/processed/spec_npy',
                                    transform=train_transforms)
    elif config.dataset == 'JamendoSpecHDF5':
        dataset = JamendoSpecHDF5(root, subset, split, mode, train_transforms,
                                  hdf5_filename='data/processed/jamendo.hdf5')
    elif config.dataset == 'JamendoSpecLMDB':
        dataset = JamendoSpecLMDB(root, subset, split, mode, train_transforms,
                                  lmdb_path='data/processed/triton')
    elif config.dataset == 'JamendoSpecLMDBsubdir':
        dataset = JamendoSpecLMDBsubdir(root, subset, split, mode, train_transforms,
                                        lmdb_path='data/processed/chunks')
    elif config.dataset == 'fake':
        dataset = dset.FakeData(image_size=(1, 96, 1366),
                                transform=transforms.Compose([
                                    transforms.RandomCrop((96, 256), pad_if_needed=True,
                                                          padding_mode='reflect'),
                                    transforms.ToTensor()
                                ]))
    elif config.dataset == 'SVHN':
        dataset = dset.SVHN(root='/m/cs/work/falconr1/datasets/SVHN',
                            transform=transforms.Compose([
                                transforms.RandomCrop((96, 256), pad_if_needed=True,
                                                      padding_mode='reflect'),
                                transforms.ToTensor()
                            ]),
                            download=True)
    elif config.dataset == 'JamendoAudioFolder_torchaudio':
        dataset = JamendoAudioFolder_torchaudio(root, subset, split, mode,
                                                transform=transforms.Compose([
                                                    tforms.MelSpectrogram(sr=44100,
                                                                          n_fft=512,
                                                                          ws=256,
                                                                          hop=256,
                                                                          f_min=20.0,
                                                                          f_max=8000,
                                                                          pad=0,
                                                                          n_mels=96),
                                                    transforms.ToPILImage(),
                                                    transforms.RandomCrop((96, 256),
                                                                          pad_if_needed=True,
                                                                          padding_mode='reflect'),
                                                    transforms.ToTensor(),
                                                ]))
    elif config.dataset == 'JamendoAudioFolder_audtorch':
        dataset = JamendoAudioFolder_audtorch(root, subset, split, mode,
                                              ## transform=tforms2.RandomCrop(size=256*44100),
                                              # transform=tforms2.Compose([
                                              #     tforms2.Downmix(1),
                                              #     tforms2.Normalize(),
                                              #     tforms2.Spectrogram(window_size=256,
                                              #                         hop_size=256,
                                              #                         fft_size=512),
                                              #     tforms2.Log(),
                                              #     # tforms2.LogSpectrogram(window_size=256,
                                              #     #                        hop_size=256,
                                              #     #                        normalize=True),
                                              #     myTforms.Debugger(),
                                              #     myTforms.CFL2FLC(),
                                              #     transforms.ToPILImage(),
                                              #     transforms.RandomCrop((96, 256), pad_if_needed=True, padding_mode='reflect'),
                                              #     transforms.ToTensor(),
                                              # ])
                                              )
    elif config.dataset == 'JamendoAudioFolder_npy':
        dataset = JamendoAudioFolder_npy(root, subset, split, mode,
                                         trim_to_size=config.trim_size,
                                         ### transform=tforms2.Downmix(1),
                                         # transform=tforms2.RandomCrop(size=30*44100),
                                         # transform=tforms2.Compose([
                                         #     tforms2.Downmix(1),
                                         #     tforms2.Normalize(),
                                         #     tforms2.Spectrogram(window_size=256,
                                         #                         hop_size=256,
                                         #                         fft_size=512),
                                         #     tforms2.Log(),
                                         #     # tforms2.LogSpectrogram(window_size=256,
                                         #     #                        hop_size=256,
                                         #     #                        normalize=True),
                                         #     myTforms.Debugger(),
                                         #     myTforms.CFL2FLC(),
                                         #     transforms.ToPILImage(),
                                         #     transforms.RandomCrop((96, 256), pad_if_needed=True, padding_mode='reflect'),
                                         #     transforms.ToTensor(),
                                         # ])
                                         )
    elif config.dataset == 'JamendoAudioFolder_torch':
        dataset = JamendoAudioFolder_torch(root, subset, split, mode,
                                           ### transform=tforms2.Downmix(1),
                                           transform=tforms2.RandomCrop(size=30*44100),
                                           # transform=tforms2.Compose([
                                           #     tforms2.Downmix(1),
                                           #     tforms2.Normalize(),
                                           #     tforms2.Spectrogram(window_size=256,
                                           #                         hop_size=256,
                                           #                         fft_size=512),
                                           #     tforms2.Log(),
                                           #     # tforms2.LogSpectrogram(window_size=256,
                                           #     #                        hop_size=256,
                                           #     #                        normalize=True),
                                           #     myTforms.Debugger(),
                                           #     myTforms.CFL2FLC(),
                                           #     transforms.ToPILImage(),
                                           #     transforms.RandomCrop((96, 256), pad_if_needed=True, padding_mode='reflect'),
                                           #     transforms.ToTensor(),
                                           # ])
                                           )

    # Draw a random subset of at most config.data_limit samples (requires
    # config.data_limit <= len(dataset), since replace=False).
    subset_indices = np.random.choice(range(len(dataset)),
                                      config.data_limit,
                                      replace=False)
    print('------ Dataset length = {}, using {} samples.'.format(len(dataset),
                                                                 len(subset_indices)))

    if config.collate_fn == 'seq2seq':
        collate = Seq2Seq([-1, -1], batch_first=None, sort_sequences=False)
        # collate = Seq2Seq_short([-1, -1], batch_first=None, sort_sequences=False)
    else:
        collate = torch.utils.data.dataloader.default_collate

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=config.batch_size,
                                             # shuffle=True,  # must stay off: DataLoader forbids
                                             # shuffle together with a sampler
                                             num_workers=config.num_workers,
                                             pin_memory=True,
                                             sampler=torch.utils.data.sampler.SubsetRandomSampler(subset_indices),
                                             collate_fn=collate,
                                             drop_last=True,
                                             )
    return dataloader
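
# Hedged usage sketch: a minimal config stub to smoke-test get_DataLoader via
# the 'fake' dataset branch, which needs no audio on disk. The attribute names
# mirror the accesses above; the values are illustrative only, and
# get_Transforms() plus the torchvision imports are assumed to resolve as in
# this module.
class _SmokeConfig:
    platform = 0
    subset = None            # unused by the 'fake' branch
    dataset = 'fake'
    data_limit = 32
    collate_fn = 'default'   # anything but 'seq2seq' selects default_collate
    batch_size = 8
    num_workers = 0


if __name__ == '__main__':
    loader = get_DataLoader(_SmokeConfig())
    images, targets = next(iter(loader))
    print(images.shape)      # expected: torch.Size([8, 1, 96, 256])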