Example #1
def get_train_transforms(
        config: object,
        transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = tforms_vision.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                            n_fft=config.n_fft,
                                            win_length=config.hop_length,
                                            hop_length=config.hop_length,
                                            f_min=float(config.fmin),
                                            f_max=float(config.fmax),
                                            pad=0,
                                            n_mels=config.n_mels),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                # tforms_aud.RandomCrop(config.max_length_frames),  # Raises "Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead." (see the detach sketch after this example)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    else:
        if transforms_set == TformsSet.TorchAudio:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_torch.Spectrogram(n_fft=config.n_fft,
                                         win_length=config.hop_length,
                                         hop_length=config.hop_length,
                                         pad=0,
                                         power=2,
                                         normalized=True),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    return trans
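
The commented-out RandomCrop in the torchaudio branch above fails because the mel spectrogram tensor still requires grad when the numpy-based crop calls .numpy() on it. A minimal, hypothetical workaround (the DetachTensor wrapper below is an assumption, not part of the original code) is to detach the tensor right before the audtorch crop:

import torch

class DetachTensor(object):
    """Detach a tensor from the autograd graph so numpy-based transforms can call .numpy()."""
    def __call__(self, tensor: torch.Tensor) -> torch.Tensor:
        return tensor.detach()

# e.g. in the Compose above:
#     tforms_torch.AmplitudeToDB(stype='power', top_db=80),
#     DetachTensor(),
#     tforms_aud.RandomCrop(config.max_length_frames),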
Example #2
def test_compose(input, idx, axis):
    t = transforms.Compose(
        [transforms.Crop(idx, axis=axis),
         transforms.Normalize(axis=axis)])
    expected_output = F.crop(input, idx, axis=axis)
    expected_output = F.normalize(expected_output, axis=axis)
    assert np.array_equal(t(input), expected_output)
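
A minimal usage sketch of the composition under test, assuming the transforms and F aliases above refer to audtorch.transforms and audtorch.transforms.functional, and that Crop accepts a (start, end) index tuple as in Example #5:

import numpy as np
from audtorch import transforms

signal = np.random.uniform(-1, 1, (1, 16000)).astype('float32')
t = transforms.Compose([transforms.Crop((0, 8000)), transforms.Normalize()])
output = t(signal)   # crop to the first 8000 samples, then peak-normalize
print(output.shape)  # (1, 8000)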
Example #3
root = '/m/cs/work/falconr1/datasets/mtg-jamendo-dataset-master'

split = 0
start_id = config.start_id * config.chunk_size
stop_id = start_id + config.chunk_size
full_output_path = os.path.join(root, config.output_path)

dataset = JamendoAudioFolder_audtorch(
    root,
    config.subset,
    split,
    config.mode,
    return_fname=True,
    transform=tforms2.Compose([
        tforms2.RandomCrop(config.max_length),
        tforms2.Downmix(1),
        tforms2.Normalize()
    ]),
)

## TODO use the audio dataset, not the npy

# dataset = JamendoAudioFolder_npy(root,
#                                  config.subset,
#                                  split,
#                                  config.mode,
#                                  trim_to_size=config.max_length,
#                                  return_fname=True,
#                                  transform=tforms2.Compose([
#                                            tforms2.Downmix(1),
#                                            tforms2.Normalize()]
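
Independent of the custom JamendoAudioFolder datasets, the transform pipeline above can be exercised on its own. A minimal sketch, assuming tforms2 is audtorch.transforms:

import numpy as np
from audtorch import transforms as tforms2

stereo = np.random.uniform(-1, 1, (2, 44100)).astype('float32')
pipeline = tforms2.Compose([
    tforms2.RandomCrop(4096),   # random 4096-sample excerpt along the time axis
    tforms2.Downmix(1),         # mix both channels down to mono
    tforms2.Normalize(),        # scale the signal to the [-1, 1] range
])
print(pipeline(stereo).shape)   # (1, 4096)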
Example #4
                                  sampling_rate=sampling_rate,
                                  mean=mean,
                                  stdev=stdev)
    noise, label = next(iter(dataset))
    samples = int(np.ceil(duration * sampling_rate))
    assert noise.shape == (1, samples)
    assert label == 'white noise'
    assert -1 <= np.max(np.abs(noise)) <= 1
    assert len(dataset) == 1


# --- datasets/utils.py ---
crop = transforms.RandomCrop(8192)
resamp1 = transforms.Resample(48000, 44100)
resamp2 = transforms.Resample(44100, 16000)
t1 = transforms.Compose([crop, resamp1])
t2 = transforms.Compose([crop, resamp1, resamp2])
t3 = transforms.Compose([resamp1, crop, resamp2])
d0 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=crop)
d1 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t1)
d2 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t2)
d3 = datasets.WhiteNoise(duration=0.5, sampling_rate=48000, transform=t3)
df_empty = pd.DataFrame()
df_a = pd.DataFrame(data=[0], columns=['a'])
df_ab = pd.DataFrame(data=[('0', 1)], columns=['a', 'b'])


@pytest.mark.parametrize('list_of_datasets', [
    (d2, d3), pytest.param([d0, d1], marks=xfail(raises=ValueError))
])
def test_audioconcatdataset(list_of_datasets):
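
A hedged sketch of what the parametrization above encodes: d2 and d3 both end up at an effective 16 kHz after their resampling transforms, while d0 stays at 48 kHz and d1 at 44.1 kHz, which is presumably why the second parameter set is expected to raise ValueError (assuming audtorch's AudioConcatDataset rejects datasets whose effective sampling rates differ):

from audtorch import datasets

concat_ok = datasets.AudioConcatDataset([d2, d3])    # both effectively 16 kHz
# datasets.AudioConcatDataset([d0, d1])              # 48 kHz vs. 44.1 kHz -> ValueError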
Example #5
def get_train_transforms(config: object,
                         set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                                      n_fft=config.n_fft,
                                      win_length=config.hop_length,
                                      hop_length=config.hop_length,
                                      f_min=float(config.fmin),
                                      f_max=float(config.fmax),
                                      pad=0,
                                      n_mels=config.n_mels),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                      padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:  # no real mel spectrogram in audtorch (see the librosa sketch after this example)
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                tforms2.Log(),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    else:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.Spectrogram(n_fft=config.n_fft,
                                   win_length=config.hop_length,
                                   hop_length=config.hop_length,
                                   pad=0,
                                   power=2,
                                   normalized=True),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                #                       n_fft=config.n_fft,
                #                       win_length=config.hop_length,
                #                       hop_length=config.hop_length,
                #                       f_min=float(config.fmin),
                #                       f_max=float(config.fmax),
                #                       pad=0,
                #                       n_mels=config.n_mels),

                #transforms.ToPILImage(),
                #transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                      padding_mode='reflect'),
                #transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                #tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    return trans
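
As noted in the Audtorch branch, audtorch itself has no mel spectrogram transform; a numpy-based one can be built on librosa, which is presumably what the custom myTforms.Spectrogram wraps. A minimal sketch (the class name and constructor parameters are assumptions, not the original implementation):

import librosa
import numpy as np

class LibrosaMelSpectrogram(object):
    """Mel spectrogram in dB, computed with librosa from a (channels, samples) numpy signal."""
    def __init__(self, sample_rate, n_fft, hop_length, n_mels, fmin, fmax):
        self.kwargs = dict(sr=sample_rate, n_fft=n_fft, hop_length=hop_length,
                           n_mels=n_mels, fmin=fmin, fmax=fmax)

    def __call__(self, signal: np.ndarray) -> np.ndarray:
        mel = librosa.feature.melspectrogram(y=signal.squeeze(), **self.kwargs)
        return librosa.power_to_db(mel, top_db=80.0)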
Example #6
def compareTforms(config):
    '''
    Here I compare different transformation sets for spectrograms (torchaudio, audtorch, and my own custom
    spectrogram using librosa). The code is applied to a sample audio file from the LibriSpeech dataset.

    This was written mostly as a minimal working example to post as an issue on GitHub.
    '''
    config.use_mels = False
    config.win_length = 400
    config.hop_length = 400
    config.n_fft = 2048
    config.resampling_rate = 16000
    augment1 = tforms2.Compose([
        myTforms.ToTensor(),
        tforms.Spectrogram(
            n_fft=2048,
            win_length=400,  # 400 samples @ 16k = 25 ms,
            hop_length=400,
            pad=0,
            power=2,
            normalized=False),
        tforms.AmplitudeToDB(stype='power', top_db=80)
    ])

    augment2 = tforms2.Compose([
        tforms2.Spectrogram(
            window_size=400,  # 400 samples @ 16k = 25 ms
            hop_size=400,
            fft_size=2048),
        myTforms.ToTensor(),
        tforms.AmplitudeToDB(stype='magnitude', top_db=80)
    ])

    augment3 = tforms2.Compose([myTforms.Spectrogram(config)])

    data1 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment1)
    data2 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment2)
    data3 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment3)

    plt.figure(figsize=(16, 8))

    titles = ['torchaudio', 'audtorch', 'myset']
    for i, data in enumerate([data1, data2, data3]):
        spec, label = data[0]

        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()

        plt.subplot(1, 3, i + 1)
        plt.imshow(spec.squeeze(),
                   interpolation='nearest',
                   cmap='inferno',
                   origin='lower',
                   aspect='auto')
        plt.colorbar()
        plt.title(titles[i])

    plt.savefig(os.path.join('./results', 'Test_Output_compare_specs.png'))
    plt.show()
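
One detail worth keeping in mind when comparing the three panels: augment1 computes a power spectrogram (power=2) and converts it with stype='power', a 10 * log10 scaling, while augment2 converts the audtorch spectrogram with stype='magnitude', a 20 * log10 scaling, so the two should land on the same dB scale if the latter is indeed a magnitude spectrogram (up to windowing and normalization differences). A quick numerical check of that equivalence:

import numpy as np

mag = np.abs(np.random.randn(5)) + 1e-3
db_from_power = 10 * np.log10(mag ** 2)   # what stype='power' applies to |X|**2
db_from_mag = 20 * np.log10(mag)          # what stype='magnitude' applies to |X|
assert np.allclose(db_from_power, db_from_mag)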