Example #1
    def test_batch_spectrogram(self):
        waveform, sample_rate = torchaudio.load(self.test_filepath)

        # Single then transform then batch
        expected = transforms.Spectrogram()(waveform).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = transforms.Spectrogram()(waveform.repeat(3, 1, 1))

        self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
        self.assertTrue(torch.allclose(computed, expected))
Example #2
    def __init__(self, d, src_path, batch_size, device):
        super(VAE, self).__init__()
        '''
        =========== ARGUMENTS ===========
            > d - dimensionality of latent space
            > src_path - path to source samples
            > batch_size - number of training examples in single batch
            > device - CPU or GPU in use
        =================================
        '''
        self.enc = ResNetBigger(d=d)
        self.dec = nn.Sequential(nn.Linear(d, 40), nn.ReLU(),
                                 nn.Linear(40, 50), nn.Sigmoid())

        # For computing spectrogram then normalizing
        self.spectrogram = T.Spectrogram(
            n_fft=2048,
            win_length=None,
            hop_length=512,
            power=2,
        )
        self.amp_to_db = T.AmplitudeToDB(stype='power')

        self.src = torch.from_numpy(np.load(src_path))
        self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
        self.src = self.src.to(device)

        self.d = d
        self.device = device
Example #3
def preprocess(file_path='../DATASETS/LJSpeech-1.1/metadata.csv',
               root_dir='../DATASETS/LJSpeech-1.1'):
    with open(file_path, encoding='utf8') as file:
        data_ = [line.strip().split('|') for line in file]
    sample_rate = 8000
    resample = transforms.Resample(orig_freq=22050, new_freq=sample_rate)
    spectrogram = transforms.Spectrogram(n_fft=1024, hop_length=256)
    to_mel = transforms.MelScale(n_mels=80,
                                 sample_rate=sample_rate,
                                 n_stft=1024 // 2 + 1)

    mel_data = torch.zeros(len(data_), 316, 80)
    mel_len = torch.empty(len(data_), dtype=torch.int)

    for idx, data in enumerate(tqdm(data_)):
        path, text = data[0], data[1]
        path = f'{root_dir}/wavs/{path}.wav'

        data, sample_rate = torchaudio.load(path)
        data = resample(data)
        data = spectrogram(data)
        data = to_mel(data)
        data = data.transpose(1, 2).squeeze(0)
        mel_data[idx, :data.size(0)] = data
        mel_len[idx] = data.size(0)

    torch.save(mel_data, f'{root_dir}/mel_data.pt')
    torch.save(mel_len, f'{root_dir}/mel_len.pt')
Example #4
 def test_spectrogram(self):
     specgram = transforms.Spectrogram(center=False,
                                       pad_mode="reflect",
                                       onesided=False)
     self.assertEqual(specgram.center, False)
     self.assertEqual(specgram.pad_mode, "reflect")
     self.assertEqual(specgram.onesided, False)
Example #5
    def test_Spectrogram_complex(self):
        n_fft = 400
        hop_length = 200
        sample_rate = 16000
        waveform = get_whitenoise(
            sample_rate=sample_rate,
            n_channels=1,
        ).to(self.device, self.dtype)

        expected = librosa.core.spectrum._spectrogram(
            y=waveform[0].cpu().numpy(),
            n_fft=n_fft,
            hop_length=hop_length,
            power=1)[0]

        result = T.Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            power=None,
            return_complex=True,
        ).to(self.device, self.dtype)(waveform)[0]
        self.assertEqual(result.abs(),
                         torch.from_numpy(expected),
                         atol=1e-5,
                         rtol=1e-5)
Example #6
    def __getitem__(self, index):
        filename = self.data_path[index]
        n_fft = 128
        #fbins = n_fft//2 + 1
        spec_transform = transforms.Spectrogram(n_fft = n_fft, normalized = False)

        label = int(filename.split("/")[-1].split("_")[0])
        soundSource = filename.split("/")[-1].split("_")[1]
        number = filename.split("/")[-1].split("_")[2]

        wave, sample_rate = torchaudio.load_wav(filename)

        spec = spec_transform(wave)

        log_spec = (spec + 1e-9).log2()[0, :, :]
        

        width = 65
        height = log_spec.shape[0]
        dim = (width, height)
        log_spec = cv2.resize(log_spec.numpy(), dim, interpolation = cv2.INTER_AREA)
        plt.figure()
        plt.imshow(log_spec)
        plt.show()
        

        return log_spec, label, soundSource
Example #7
 def __init__(self, sample_rate=16000, n_fft=800, win_length=800, hop_length=200, 
              n_mels=80, rescale=True, rescaling_max=0.9, max_abs_value=4.,
              preemphasis=0.97, preemphasize=True, fmin=55, fmax=7600,
              min_level_db=-100, ref_level_db=20, symmetric_mels=True):
     super(logFbankCal, self).__init__()
     
     # these basic hyperparameters can be removed
     self.sample_rate = sample_rate
     self.n_fft = n_fft
     self.win_length = win_length
     self.hop_length = hop_length
     self.n_mels = n_mels
     self.fmin = fmin
     self.fmax = fmax
     
     self.rescale = rescale
     self.rescaling_max = torch.tensor(rescaling_max, dtype=torch.float)
     self.preemphasize = preemphasize
     self.flipped_filter = torch.FloatTensor([-preemphasis, 1.]).unsqueeze(0).unsqueeze(0)
     
     self.stftCal = transforms.Spectrogram(n_fft, win_length, hop_length, power=None)
     mel_basis = librosa_mel_fn(sample_rate, n_fft, n_mels, fmin, fmax)
     self.mel_basis = torch.from_numpy(mel_basis).float()
     
     self.symmetric_mels = symmetric_mels
     self.ref_level_db = torch.tensor(ref_level_db, dtype=torch.float)
     self.min_level_db = torch.tensor(min_level_db, dtype=torch.float)
     self.min_level = torch.tensor(np.exp(min_level_db / 20 * np.log(10)), dtype=torch.float)
     self.max_abs_value = torch.tensor(max_abs_value, dtype=torch.float)
Example #8
 def test_spectrogram(self, kwargs):
     # replication_pad1d_backward_cuda is not deterministic and
     # gives a very small (~2.7756e-17) difference.
     #
     # See https://github.com/pytorch/pytorch/issues/54093
     transform = T.Spectrogram(**kwargs)
     waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
     self.assert_grad(transform, [waveform], nondet_tol=1e-10)
Example #9
 def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
     super(PhaseFbankCal, self).__init__()
     self.complexSpec = transforms.Spectrogram(n_fft=n_fft,
                                               win_length=win_length,
                                               hop_length=hop_length,
                                               power=None)
     self.mel_scale = transforms.MelScale(n_mels=n_mels,
                                          sample_rate=sample_rate,
                                          n_stft=n_fft // 2 + 1)
Example #10
 def compute_stft(waveform: Tensor, n_fft: int, win_length: int,
                  hop_length: int) -> Tuple[Tensor, Tensor]:
     device = waveform.device
     spectrogram = _transf.Spectrogram(n_fft=n_fft,
                                       win_length=win_length,
                                       hop_length=hop_length,
                                       power=None).to(device)
     amplitude, phase = _func.magphase(spectrogram(waveform))
     return amplitude, phase
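# Note: torchaudio.functional.magphase is deprecated in newer torchaudio releases
# in favor of complex tensors. A minimal sketch of an equivalent helper, assuming a
# torchaudio version that accepts return_complex=True (as used in Example #5):
from typing import Tuple

from torch import Tensor
from torchaudio import transforms as _transf


def compute_stft_complex(waveform: Tensor, n_fft: int, win_length: int,
                         hop_length: int) -> Tuple[Tensor, Tensor]:
    spectrogram = _transf.Spectrogram(n_fft=n_fft,
                                      win_length=win_length,
                                      hop_length=hop_length,
                                      power=None,
                                      return_complex=True).to(waveform.device)
    spec = spectrogram(waveform)
    # Magnitude and phase come straight from the complex-valued spectrogram.
    return spec.abs(), spec.angle()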
Example #11
 def __init__(self, n_fft, hop_length):
     """Calculate spectrogram of a set of 1D signals.
     
     The first dimension is batch/channel and the second should be time.
     
     Arguments:
         n_fft {int} -- Size of time window over which to calculate each FFT.
         hop_length {int} -- The stride length between the start of each FFT window.
     """
     self.spec_fn = transforms.Spectrogram(n_fft=n_fft,
                                           hop_length=hop_length)
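# A minimal shape check for the convention described in the docstring
# (batch/channel first, time second); the numbers below are illustrative only:
import torch
from torchaudio import transforms

spec_fn = transforms.Spectrogram(n_fft=512, hop_length=128)
signals = torch.randn(4, 16000)   # [batch, time]
spec = spec_fn(signals)           # [batch, n_fft // 2 + 1, frames]
print(spec.shape)                 # torch.Size([4, 257, 126]) with the default center=True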
Example #12
    def test_roundtrip_spectrogram(self, **args):
        """Test the spectrogram + inverse spectrogram results in approximate identity."""

        waveform = get_whitenoise(sample_rate=8000,
                                  duration=0.5,
                                  dtype=self.dtype)

        s = T.Spectrogram(**args, power=None)
        inv_s = T.InverseSpectrogram(**args)
        transformed = s.forward(waveform)
        restored = inv_s.forward(transformed, length=waveform.shape[-1])
        self.assertEqual(waveform, restored, atol=1e-6, rtol=1e-6)
Example #13
    def __init__(self, d, src_path, batch_size, device, dropout_rate=0.5, encoder = 'ResNet'):
        super(VAE, self).__init__()
        '''
        =========== ARGUMENTS ===========
            > d - dimensionality of latent space
            > src_path - path to source samples
            > batch_size - number of training examples in single batch
            > device - CPU or GPU in use
        =================================
        '''
        if encoder == 'ResNet':
            self.enc = ResNetBigger(d=d, dropout_rate=dropout_rate)
        elif encoder == 'CNN':
            self.enc = CNN(d=d, batch_size=batch_size, dropout_rate=dropout_rate)
        elif encoder == 'linear':
            self.enc = nn.Sequential(
                nn.Linear(1025, 400),
                nn.ReLU(),
                nn.Linear(400, 100),
                nn.ReLU(),
                nn.Linear(100, 50),
                nn.ReLU(),
                nn.Linear(50, d)
            )

        self.dec = nn.Sequential(
            nn.Linear(d, 40),
            nn.ReLU(),
            nn.Linear(40, 50),
            nn.Softmax()
        )

        self.softmax = nn.Softmax(dim=0)

        # For computing spectrogram then normalizing
        self.spectrogram = T.Spectrogram(
            n_fft=2048,
            win_length=None,
            hop_length=512,
            power=2,
        )
        self.amp_to_db = T.AmplitudeToDB(stype='power')

        self.src = torch.from_numpy(np.load(src_path))
        self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
        self.src = self.src.to(device)

        self.d = d
        self.global_step = 0
        self.epoch = 0  
Example #14
def _wav_to_spec(path=None, wav=None, sr=sample_rate, engine='librosa'):
    ''' STFT Spectrogram with absolute values '''
    if path is None and wav is None:
        raise ValueError

    if path is not None:
        wav, _ = librosa.core.load(path, sr=None)

    if engine == 'librosa':
        return np.abs(librosa.stft(wav, **stft_params))
    elif engine == 'torch':
        return tf.Spectrogram(**stft_params,
                              power=power)(torch.from_numpy(wav))

    raise ValueError(engine)
Example #15
  def spectro_gram(aud, spectro_type='mel', n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    f_min, f_max, ws, top_db, pad = 0.0, None, None, 80, 0

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    if (spectro_type == 'mel'):
      spec = transforms.MelSpectrogram(sr, n_fft, ws, hop_len, f_min, f_max, pad, n_mels)(sig)
    elif (spectro_type == 'mfcc'):
      pass
    else:
      spec = transforms.Spectrogram(n_fft, ws, hop_len, pad, normalized=False)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)
Example #16
def get_train_transforms(
        config: object,
        transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = tforms_vision.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                            n_fft=config.n_fft,
                                            win_length=config.hop_length,
                                            hop_length=config.hop_length,
                                            f_min=float(config.fmin),
                                            f_max=float(config.fmax),
                                            pad=0,
                                            n_mels=config.n_mels),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                #tforms_aud.RandomCrop(config.max_length_frames),  # Raises "Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead."
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    else:
        if transforms_set == TformsSet.TorchAudio:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_torch.Spectrogram(n_fft=config.n_fft,
                                         win_length=config.hop_length,
                                         hop_length=config.hop_length,
                                         pad=0,
                                         power=2,
                                         normalized=True),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    return trans
Example #17
def get_spectrogram(
    n_fft = 400,
    win_len = None,
    hop_len = None,
    power = 2.0,
):
  waveform, _ = get_speech_sample()
  spectrogram = T.Spectrogram(
      n_fft=n_fft,
      win_length=win_len,
      hop_length=hop_len,
      center=True,
      pad_mode="reflect",
      power=power,
  )
  return spectrogram(waveform)
Example #18
def test_ecoacoustics_dm(root: Path):
    dm = EcoacousticsDataModule(root=root,
                                segment_len=30.0,
                                target_attrs="habitat",
                                train_transforms=AT.Spectrogram())
    dm.prepare_data()
    dm.setup()

    # Test loading a sample.
    train_dl = dm.train_dataloader()
    test_sample = next(iter(train_dl))

    # Test size().
    assert test_sample.x.size()[1] == dm.dims[0]
    assert test_sample.x.size()[2] == dm.dims[1]
    assert test_sample.x.size()[3] == dm.dims[2]
Example #19
    def __call__(self, data):
        signal = data["signal"]
        sr = data['sample_rate']

        self.n_fft = int(np.ceil(0.025 * sr))
        self.win_length = int(np.ceil(0.025 * sr))
        self.hop_length = int(np.ceil(0.01 * sr))

        spec = nn.Sequential(
            T.Spectrogram(n_fft=self.n_fft,
                          win_length=self.win_length,
                          hop_length=self.hop_length),
            T.AmplitudeToDB())

        spectrogram = spec(signal)
        data['Spectrogram'] = spectrogram
        data['input'] = spectrogram

        return data
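# For reference, the window arithmetic above at an assumed 16 kHz sample rate
# (the rate is chosen purely for illustration):
import numpy as np

sr = 16000
n_fft = int(np.ceil(0.025 * sr))       # 400 samples -> 25 ms analysis window
win_length = int(np.ceil(0.025 * sr))  # 400 samples
hop_length = int(np.ceil(0.01 * sr))   # 160 samples -> 10 ms hop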
Example #20
def get_time_frequency_transform(config):
    """
    Returns an nn.Sequential block that applies a time-frequency transform and crops to the desired size.
    The spectrogram has shape: [batch, channels, freq_bins, frames]

    :param config:
    :return:
    """
    if config.use_mels:
        transformer = nn.Sequential(
            tforms_torch.MelSpectrogram(sample_rate=config.new_fs,
                                        n_fft=config.n_fft,
                                        win_length=config.win_length,
                                        hop_length=config.hop_length,
                                        f_min=float(config.fmin),
                                        f_max=float(config.fmax),
                                        pad=0,
                                        n_mels=config.n_mels),
            #utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.RandomCrop(
                (1, config.n_mels if config.use_mels else config.n_fft // 2 +
                 1, config.max_length_frames),
                value=0),
            tforms_torch.AmplitudeToDB(stype='power', top_db=80),
            tforms_mine.ReScaleSpec([-1, 1]),
        )
    else:
        transformer = nn.Sequential(
            tforms_torch.Spectrogram(n_fft=config.n_fft,
                                     win_length=config.win_length,
                                     hop_length=config.hop_length,
                                     pad=0,
                                     power=2,
                                     normalized=True),
            tforms_mine.RandomCrop(
                (1, config.n_mels if config.use_mels else config.n_fft // 2 +
                 1, config.max_length_frames),
                value=0),
            tforms_mine.AmplitudeToDB(stype='power', top_db=80),
            #utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.ReScaleSpec([-1, 1]),
        )

    return transformer
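# A hypothetical usage sketch, assuming `config` exposes the fields referenced above
# (use_mels, new_fs, n_fft, win_length, hop_length, fmin, fmax, n_mels,
# max_length_frames) and that the custom tforms_mine transforms are importable:
from types import SimpleNamespace

import torch

config = SimpleNamespace(use_mels=True, new_fs=16000, n_fft=1024, win_length=1024,
                         hop_length=512, fmin=0.0, fmax=8000.0, n_mels=64,
                         max_length_frames=128)
transformer = get_time_frequency_transform(config)
waveform = torch.randn(8, 1, 16000)  # [batch, channels, samples]
spec = transformer(waveform)         # [batch, channels, freq_bins, frames], per the docstring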
Example #21
 def tfm_spectro(ad, sr):
     # We must reshape signal for torchaudio to generate the spectrogram.
     ws=512
     hop=256
     to_db_scale=False
     n_fft=1024
     f_min=0.0
     f_max=-80 
     pad=0
     n_mels=128
     #mel = transforms.MelSpectrogram(sr, n_mels=n_mels, n_fft=n_fft, hop=hop, f_min=f_min, f_max=f_max, pad=pad)(ad)
     sp = transforms.Spectrogram()(ad)
     mel = transforms.MelScale()(sp)
     #mel = mel.permute(0,2,1) # swap dimension, mostly to look sane to a human.
     #if to_db_scale: mel = transforms.SpectrogramToDB(stype='magnitude', top_db=f_max)(mel)
     #mel = mel.detach().numpy()
     if to_db_scale: 
         mel = 20*torch.log10(mel)
     return mel
Example #22
    def __init__(self,
                 table_path,
                 alphabet,
                 dataset_path='',
                 max_len=0,
                 preprocess='raw',
                 normalize=False,
                 pkwargs=None):
        global transform, do_normalize

        super(SpeechDataset, self).__init__()
        self.table = pd.read_csv(table_path)
        self.dataset_path = dataset_path
        self.intencode = IntegerEncode(alphabet)
        self.max_len = max_len
        if preprocess == "mfcc":
            transform = transforms.MFCC(sample_rate=pkwargs['sr'],
                                        n_mfcc=pkwargs['num_features'])
        elif preprocess == "spectrogram":
            transform = transforms.Spectrogram(
                n_fft=pkwargs['n_fft'], normalized=pkwargs['normalized'])
Example #23
    def test_Spectrogram(self, n_fft, hop_length, power):
        sample_rate = 16000
        waveform = get_whitenoise(
            sample_rate=sample_rate,
            n_channels=1,
        ).to(self.device, self.dtype)

        expected = librosa.core.spectrum._spectrogram(
            y=waveform[0].cpu().numpy(),
            n_fft=n_fft,
            hop_length=hop_length,
            power=power)[0]

        result = T.Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            power=power,
        ).to(self.device, self.dtype)(waveform)[0]
        self.assertEqual(result,
                         torch.from_numpy(expected),
                         atol=1e-5,
                         rtol=1e-5)
Example #24
def compareTforms(config):
    '''
    Compare different transformation sets for spectrograms (torchaudio, audtorch, and my own
    custom spectrogram using librosa). The code is applied to a sample audio file from the
    LibriSpeech dataset.

    It was written mostly as a minimal working example to post as a GitHub issue.
    '''
    config.use_mels = False
    config.win_length = 400
    config.hop_length = 400
    config.n_fft = 2048
    config.resampling_rate = 16000
    augment1 = tforms2.Compose([
        myTforms.ToTensor(),
        tforms.Spectrogram(
            n_fft=2048,
            win_length=400,  # 400 samples @ 16k = 25 ms,
            hop_length=400,
            pad=0,
            power=2,
            normalized=False),
        tforms.AmplitudeToDB(stype='power', top_db=80)
    ])

    augment2 = tforms2.Compose([
        tforms2.Spectrogram(
            window_size=400,  # 400 samples @ 16k = 25 ms
            hop_size=400,
            fft_size=2048),
        myTforms.ToTensor(),
        tforms.AmplitudeToDB(stype='magnitude', top_db=80)
    ])

    augment3 = tforms2.Compose([myTforms.Spectrogram(config)])

    data1 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment1)
    data2 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment2)
    data3 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment3)

    plt.figure(figsize=(16, 8))

    titles = ['torchaudio', 'audtorch', 'myset']
    for i, data in enumerate([data1, data2, data3]):
        spec, label = data[0]

        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()

        plt.subplot(1, 3, i + 1)
        plt.imshow(spec.squeeze(),
                   interpolation='nearest',
                   cmap='inferno',
                   origin='lower',
                   aspect='auto')
        plt.colorbar()
        plt.title(titles[i])

    plt.savefig(os.path.join('./results', 'Test_Output_compare_specs.png'))
    plt.show()
Example #25
 def test_Spectrogram_return_complex(self):
     tensor = torch.rand((1, 1000))
     self._assert_consistency(
         T.Spectrogram(power=None, return_complex=True), tensor)
Example #26
 def test_Spectrogram(self):
     tensor = torch.rand((1, 1000))
     self._assert_consistency(T.Spectrogram(), tensor)
Example #27

# samples = [0, 1, 2, 2547]
# plot_samples(samples)

df_train, df_val = train_test_split(df,
                                    train_size=SPLIT_RATIO,
                                    test_size=1 - SPLIT_RATIO,
                                    random_state=RANDOM_STATE)

print(len(df_train))
print(len(df_val))
print(f'-----------------\n' \
      f'{df_train.loc[0]}')

spectrogram = T.Spectrogram(n_fft=128, hop_length=64)

classes = torch.Tensor(df.target.unique())
model = resnet18(pretrained=True)
model.conv1 = nn.Conv2d(1,
                        model.conv1.out_channels,
                        kernel_size=model.conv1.kernel_size[0],
                        stride=model.conv1.stride[0],
                        padding=model.conv1.padding[0])
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(classes))

train = df.sample(frac=1, random_state=RANDOM_STATE)
X = train.file_path
y = train.target
model.fit(torch.Tensor(np.load(X[0])), target[0])
Example #28
#
# To get the frequency make-up of an audio signal as it varies with time,
# you can use ``Spectrogram``.
#

waveform, sample_rate = get_speech_sample()

n_fft = 1024
win_length = None
hop_length = 512

# define transformation
spectrogram = T.Spectrogram(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
)
# Perform transformation
spec = spectrogram(waveform)

print_stats(spec)
plot_spectrogram(spec[0], title='torchaudio')

######################################################################
# GriffinLim
# ----------
#
# To recover a waveform from a spectrogram, you can use ``GriffinLim``.
#
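# A minimal sketch of that recovery, reusing the spectrogram parameters defined
# above; the power value is an assumption chosen to match the power spectrogram
# computed earlier:

griffin_lim = T.GriffinLim(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    power=2.0,
)
reconstructed_waveform = griffin_lim(spec)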
Example #29
    val_path = r'X:\DS Training Data\samples\val.npy'
    val_data = Dataset(val_path)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             drop_last=True,
                                             **params)

    test_path = r'X:\DS Training Data\samples\test.npy'
    test_data = Dataset(test_path)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              drop_last=True,
                                              **params)

    spectrogram = T.Spectrogram(
        n_fft=2048,
        win_length=None,
        hop_length=512,
        power=2,
    )
    amp_to_db = T.AmplitudeToDB(stype='power')

    src_path = r'X:\DS Training Data\samples\src.npy'

    model = VAE(d=32,
                src_path=src_path,
                batch_size=params['batch_size'],
                device=device,
                dropout_rate=0.25,
                encoder='CNN').cuda()
    print(model)

    criterion = loss_function
Example #30
def get_train_transforms(config: object,
                         set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                                      n_fft=config.n_fft,
                                      win_length=config.hop_length,
                                      hop_length=config.hop_length,
                                      f_min=float(config.fmin),
                                      f_max=float(config.fmax),
                                      pad=0,
                                      n_mels=config.n_mels),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                      padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:  ## no real mel spectrogram in audtorch
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                tforms2.Log(),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    else:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.Spectrogram(n_fft=config.n_fft,
                                   win_length=config.hop_length,
                                   hop_length=config.hop_length,
                                   pad=0,
                                   power=2,
                                   normalized=True),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                #                       n_fft=config.n_fft,
                #                       win_length=config.hop_length,
                #                       hop_length=config.hop_length,
                #                       f_min=float(config.fmin),
                #                       f_max=float(config.fmax),
                #                       pad=0,
                #                       n_mels=config.n_mels),

                #transforms.ToPILImage(),
                #transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                      padding_mode='reflect'),
                #transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                #tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    return trans