Example 1
    def test_AmplitudeToDB(self):
        waveform, sample_rate = torchaudio.load(self.test_filepath)

        mag_to_db_transform = transforms.AmplitudeToDB('magnitude', 80.)
        power_to_db_transform = transforms.AmplitudeToDB('power', 80.)

        mag_to_db_torch = mag_to_db_transform(torch.abs(waveform))
        power_to_db_torch = power_to_db_transform(torch.pow(waveform, 2))

        self.assertTrue(torch.allclose(mag_to_db_torch, power_to_db_torch))
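
This test leans on the identity 20·log10|x| = 10·log10(x²), which is why the magnitude and power paths agree. A minimal standalone check of the same equivalence on synthetic data (a sketch, not part of the original suite; no audio asset needed):

import torch
from torchaudio import transforms

x = torch.rand(1, 16000) + 1e-6  # synthetic "waveform"; the offset avoids log(0)
mag_db = transforms.AmplitudeToDB('magnitude', 80.)(x.abs())
pow_db = transforms.AmplitudeToDB('power', 80.)(x.pow(2))
assert torch.allclose(mag_db, pow_db, atol=1e-5)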
Example 2
    def test_batch_AmplitudeToDB(self):
        spec = torch.rand((6, 201))

        # Single then transform then batch
        expected = transforms.AmplitudeToDB()(spec).repeat(3, 1, 1)

        # Batch then transform
        computed = transforms.AmplitudeToDB()(spec.repeat(3, 1, 1))

        self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
        self.assertTrue(torch.allclose(computed, expected))
Example 3
    def test_AmplitudeToDB(self):
        filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
        waveform = common_utils.load_wav(filepath)[0]

        mag_to_db_transform = transforms.AmplitudeToDB('magnitude', 80.)
        power_to_db_transform = transforms.AmplitudeToDB('power', 80.)

        mag_to_db_torch = mag_to_db_transform(torch.abs(waveform))
        power_to_db_torch = power_to_db_transform(torch.pow(waveform, 2))

        self.assertEqual(mag_to_db_torch, power_to_db_torch)
Example 4
def get_train_transforms(
        config: object,
        transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = tforms_vision.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                            n_fft=config.n_fft,
                                            win_length=config.hop_length,
                                            hop_length=config.hop_length,
                                            f_min=float(config.fmin),
                                            f_max=float(config.fmax),
                                            pad=0,
                                            n_mels=config.n_mels),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                #tforms_aud.RandomCrop(config.max_length_frames),  # Raises "Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead."
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    else:
        if transforms_set == TformsSet.TorchAudio:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_torch.Spectrogram(n_fft=config.n_fft,
                                         win_length=config.hop_length,
                                         hop_length=config.hop_length,
                                         pad=0,
                                         power=2,
                                         normalized=True),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100,
                                      new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    return trans
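
The error quoted next to the commented-out RandomCrop line above is PyTorch's usual autograd/NumPy clash. A minimal illustration of the standard workaround (detach before converting), independent of the transform libraries used here:

import torch

t = torch.randn(4, requires_grad=True)
# t.numpy()               # RuntimeError: Can't call numpy() on Tensor that requires grad
arr = t.detach().numpy()  # detach from the autograd graph first, then convert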
Example 5
def tfm_spectro(ad=None,
                sig=None,
                sr=16000,
                to_db_scale=False,
                n_fft=1024,
                ws=None,
                hop=None,
                f_min=0.0,
                f_max=None,
                pad=0,
                n_mels=128,
                top_db=80):
    # We must reshape the signal to (channel, time) for torchaudio to generate
    # the spectrogram. The sample rate comes from the `sr` argument, since the
    # default `ad=None` leaves no object to read it from.
    mel = transforms.MelSpectrogram(sample_rate=sr,
                                    n_mels=n_mels,
                                    n_fft=n_fft,
                                    win_length=ws,
                                    hop_length=hop,
                                    f_min=f_min,
                                    f_max=f_max,
                                    pad=pad)(sig.reshape(1, -1))

    mel = mel.permute(0, 2, 1)  # swap dimensions, mostly to look sane to a human
    if to_db_scale:
        # The mel spectrogram is power-scaled, so convert with stype='power';
        # top_db must be a non-negative value.
        mel = transforms.AmplitudeToDB(stype='power', top_db=top_db)(mel)
    return mel
Example 6
    def __init__(self, d, src_path, batch_size, device):
        super(VAE, self).__init__()
        '''
        =========== ARGUMENTS ===========
            > d - dimensionality of latent space
            > src_path - path to source samples
            > batch_size - number of training examples in single batch
            > device - CPU or GPU in use
        =================================
        '''
        self.enc = ResNetBigger(d=d)
        self.dec = nn.Sequential(nn.Linear(d, 40), nn.ReLU(),
                                 nn.Linear(40, 50), nn.Sigmoid())

        # For computing spectrogram then normalizing
        self.spectrogram = T.Spectrogram(
            n_fft=2048,
            win_length=None,
            hop_length=512,
            power=2,
        )
        self.amp_to_db = T.AmplitudeToDB(stype='power')

        self.src = torch.from_numpy(np.load(src_path))
        self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
        self.src = self.src.to(device)

        self.d = d
        self.device = device
Example 7
 def make_mel_spectrogram(self, sig_t, framerate):
     
     # Get tensor (128 x num_frames), where 128
     # is the default number of mel bands. Can 
     # change in call to MelSpectrogram:
     mel_spec_t = transforms.MelSpectrogram(
         sample_rate=framerate,
         n_fft=self.n_fft,
         win_length=self.win_length,
         hop_length=self.hop_length
         )(sig_t)
         
     # Convert the energy values to dB relative to the
     # maximum energy in the spectrogram:
     mel_spec_db_t = transforms.AmplitudeToDB()(mel_spec_t)
     
     (num_mel_bands, _num_timebins) = mel_spec_t.shape
     
     # Number of columns in the spectrogram:
     num_time_label_choices = DSPUtils.compute_timeticks(framerate, 
                                                         mel_spec_db_t
                                                         )
     # Enumeration of the mel bands to use as y-axis labels: 
     freq_labels = np.array(range(num_mel_bands))
     
     return (freq_labels, num_time_label_choices, mel_spec_db_t)
Example 8
 def test_amplitude_to_db(self):
     sample_rate = 8000
     transform = T.AmplitudeToDB()
     waveform = get_whitenoise(sample_rate=sample_rate,
                               duration=0.05,
                               n_channels=2)
     self.assert_grad(transform, [waveform])
Example 9
 def test_power_to_db(self):
     spectrogram = get_spectrogram(get_whitenoise(), n_fft=400,
                                   power=2).to(self.device, self.dtype)
     result = T.AmplitudeToDB('power', 80.).to(self.device,
                                               self.dtype)(spectrogram)[0]
     expected = librosa.core.spectrum.power_to_db(
         spectrogram[0].cpu().numpy())
     self.assertEqual(result, torch.from_numpy(expected))
Example 10
    def __init__(self, classes_num):

        super(ResNet54, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        self.mel_spectrogram = nn.Sequential(
            AT.MelSpectrogram(sample_rate=16000,
                              n_fft=512,
                              win_length=400,
                              hop_length=160,
                              n_mels=80,
                              f_max=8000), AT.AmplitudeToDB())
        melkwargs = {
            "n_fft": 512,
            "hop_length": 160,
            "win_length": 400,
            "n_mels": 80,
            "f_max": 8000
        }
        self.mfcc = AT.MFCC(sample_rate=16000, n_mfcc=40, melkwargs=melkwargs)

        # Spectrogram extractor
        #         self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
        #             win_length=window_size, window=window, center=center, pad_mode=pad_mode,
        #             freeze_parameters=True)

        #         # Logmel feature extractor
        #         self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
        #             n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
        #             freeze_parameters=True)

        #         # Spec augmenter
        #         self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
        #             freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        # self.conv_block2 = ConvBlock(in_channels=64, out_channels=64)

        self.resnet = _ResNet(block=_ResnetBottleneck,
                              layers=[3, 4, 6, 3],
                              zero_init_residual=True)

        self.conv_block_after1 = ConvBlock(in_channels=2048, out_channels=2048)

        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        #         self.fc1 = nn.Linear(2048, 2048)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weights()
Example 11
    def spectrogram(aud, n_mels=64, n_fft=1024, hop_len=None):
        sig, sr = aud
        top_db = 80

        spec = transforms.MelSpectrogram(sr,
                                         n_fft=n_fft,
                                         hop_length=hop_len,
                                         n_mels=n_mels)(sig)
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec
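
A usage sketch for this helper, assuming it is reachable as a plain function (in the original it sits inside a class, presumably as a static method) and with a random tensor standing in for a real waveform:

import torch

sig = torch.randn(1, 16000)           # hypothetical 1-second mono signal
spec = spectrogram((sig, 16000), n_mels=64)
print(spec.shape)                     # [channel, n_mels, time]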
Example 12
    def test_mel2(self):
        top_db = 80.
        s2db = transforms.AmplitudeToDB('power', top_db)

        waveform = self.waveform.clone()  # (1, 16000)
        waveform_scaled = self.scale(waveform)  # (1, 16000)
        mel_transform = transforms.MelSpectrogram()
        # check defaults
        spectrogram_torch = s2db(
            mel_transform(waveform_scaled))  # (1, 128, 321)
        self.assertTrue(spectrogram_torch.dim() == 3)
        self.assertTrue(
            spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
        self.assertEqual(spectrogram_torch.size(1), mel_transform.n_mels)
        # check correctness of filterbank conversion matrix
        self.assertTrue(mel_transform.mel_scale.fb.sum(1).le(1.).all())
        self.assertTrue(mel_transform.mel_scale.fb.sum(1).ge(0.).all())
        # check options
        kwargs = {
            'window_fn': torch.hamming_window,
            'pad': 10,
            'win_length': 500,
            'hop_length': 125,
            'n_fft': 800,
            'n_mels': 50
        }
        mel_transform2 = transforms.MelSpectrogram(**kwargs)
        spectrogram2_torch = s2db(
            mel_transform2(waveform_scaled))  # (1, 50, 513)
        self.assertTrue(spectrogram2_torch.dim() == 3)
        self.assertTrue(
            spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
        self.assertEqual(spectrogram2_torch.size(1), mel_transform2.n_mels)
        self.assertTrue(mel_transform2.mel_scale.fb.sum(1).le(1.).all())
        self.assertTrue(mel_transform2.mel_scale.fb.sum(1).ge(0.).all())
        # check on multi-channel audio
        filepath = common_utils.get_asset_path(
            'steam-train-whistle-daniel_simon.wav')
        x_stereo = common_utils.load_wav(filepath)[0]  # (2, 278756), 44100
        spectrogram_stereo = s2db(mel_transform(x_stereo))  # (2, 128, 1394)
        self.assertTrue(spectrogram_stereo.dim() == 3)
        self.assertTrue(spectrogram_stereo.size(0) == 2)
        self.assertTrue(
            spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
        self.assertEqual(spectrogram_stereo.size(1), mel_transform.n_mels)
        # check filterbank matrix creation
        fb_matrix_transform = transforms.MelScale(n_mels=100,
                                                  sample_rate=16000,
                                                  f_min=0.,
                                                  f_max=None,
                                                  n_stft=400)
        self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
        self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
        self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
Example 13
def get_mel_spectrogram(file_path, n_fft, win_length, hop_length,
                        n_mels) -> torch.Tensor:
    x, sr = torchaudio.load(file_path,
                            normalization=lambda x: torch.abs(x).max())
    mel_spectrogram = nn.Sequential(
        AT.MelSpectrogram(sample_rate=sr,
                          n_fft=n_fft,
                          win_length=win_length,
                          hop_length=hop_length,
                          n_mels=n_mels), AT.AmplitudeToDB())

    return mel_spectrogram(x)
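
Note that the `normalization=` keyword belongs to the pre-0.8 torchaudio loading API. On current torchaudio the equivalent peak normalization would be applied by hand, roughly as follows (a sketch; the path is hypothetical):

import torchaudio

x, sr = torchaudio.load("example.wav")  # hypothetical file; current API returns float32 in [-1, 1]
x = x / x.abs().max()                   # peak-normalize, matching the old lambda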
Example 14
def get_lmfs(name: str, dsp: DSP) -> torch.Tensor:
    """Return the log mel frequency spectrogram."""
    map_path = f'{RAW_PATH}/{name}'
    mono_sig, fs = load(glob(f'{map_path}/*.mp3')[0], sr=dsp.fs, res_type='kaiser_fast')
    mono_sig = torch.from_numpy(mono_sig)
    norm_sig = normalize(mono_sig)

    mfs = transforms.MelSpectrogram(sample_rate=fs, n_fft=dsp.W,
                                    f_min=dsp.f_min, f_max=dsp.f_max,
                                    n_mels=dsp.bands, hop_length=dsp.stride,
                                    window_fn=torch.hamming_window)(norm_sig)

    lmfs = transforms.AmplitudeToDB()(mfs).unsqueeze(0).half().detach()
    return lmfs
Example 15
 def __init__(self,
              wav_paths,
              script_paths,
              bos_id=1307,
              eos_id=1308,
              is_train=True):
     self.wav_paths = wav_paths
     self.script_paths = script_paths
     self.bos_id, self.eos_id = bos_id, eos_id
     self.is_train = is_train
     self.melspec = transforms.MelSpectrogram(sample_rate=SAMPLE_RATE,
                                              n_fft=N_FFT,
                                              n_mels=128)
     self.todb = transforms.AmplitudeToDB(stype="magnitude", top_db=80)
Example 16
 def spectrogram_to_db(cls, spect_magnitude):
     '''
     Takes a numpy spectrogram of magnitudes.
     Returns a numpy spectrogram containing
     dB-scaled power.

     @param spect_magnitude: magnitude spectrogram
     @type spect_magnitude: np.ndarray
     '''
     transformer = transforms.AmplitudeToDB('power')
     spect_tensor = torch.Tensor(spect_magnitude)
     spect_dB_tensor = transformer.forward(spect_tensor)
     spect_dB = spect_dB_tensor.numpy()
     return spect_dB
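
A usage sketch for this classmethod, with a random array standing in for a real magnitude spectrogram (the owning class name below is hypothetical):

import numpy as np

spect_magnitude = np.abs(np.random.randn(201, 100)).astype(np.float32)
spect_db = SpectrogramUtils.spectrogram_to_db(spect_magnitude)  # hypothetical owning class
print(spect_db.shape, spect_db.dtype)  # (201, 100) float32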
Example 17
    def __init__(self, d, src_path, batch_size, device, dropout_rate=0.5, encoder='ResNet'):
        super(VAE, self).__init__()
        '''
        =========== ARGUMENTS ===========
            > d - dimensionality of latent space
            > src_path - path to source samples
            > batch_size - number of training examples in single batch
            > device - CPU or GPU in use
        =================================
        '''
        if encoder == 'ResNet':
            self.enc = ResNetBigger(d=d, dropout_rate=dropout_rate)
        elif encoder == 'CNN':
            self.enc = CNN(d=d, batch_size=batch_size, dropout_rate=dropout_rate)
        elif encoder == 'linear':
            self.enc = nn.Sequential(
                nn.Linear(1025, 400),
                nn.ReLU(),
                nn.Linear(400, 100),
                nn.ReLU(),
                nn.Linear(100, 50),
                nn.ReLU(),
                nn.Linear(50, d)
            )

        self.dec = nn.Sequential(
            nn.Linear(d, 40),
            nn.ReLU(),
            nn.Linear(40, 50),
            nn.Softmax()
        )

        self.softmax = nn.Softmax(dim=0)

        # For computing spectrogram then normalizing
        self.spectrogram = T.Spectrogram(
            n_fft=2048,
            win_length=None,
            hop_length=512,
            power=2,
        )
        self.amp_to_db = T.AmplitudeToDB(stype='power')

        self.src = torch.from_numpy(np.load(src_path))
        self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
        self.src = self.src.to(device)

        self.d = d
        self.global_step = 0
        self.epoch = 0  
Example 18
def tfm_spectro(sig, sr=32000, to_db_scale=True, n_fft=2048,
                ws=None, hop=512, f_min=20.0, f_max=16000, pad=0, n_mels=128, top_db=80):
    """
    img_size: 224
    melspectrogram_parameters:
      n_mels: 128
      fmin: 20
      fmax: 16000
    """
    # We must reshape the signal to (channel, time) for torchaudio to generate the spectrogram.
    # f_max matches the docstring (16 kHz, the Nyquist frequency for sr=32000).
    mel = transforms.MelSpectrogram(sample_rate=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop,
                                    f_min=f_min, f_max=f_max, pad=pad)(sig.reshape(1, -1))
    mel = mel.permute(0, 2, 1)  # swap dimensions, mostly to look sane to a human
    if to_db_scale:
        # The mel spectrogram is power-scaled, so convert with stype='power'.
        mel = transforms.AmplitudeToDB(stype='power', top_db=top_db)(mel)
    return mel
Example 19
  def spectro_gram(aud, spectro_type='mel', n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    f_min, f_max, ws, top_db, pad = 0.0, None, None, 80, 0

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    if (spectro_type == 'mel'):
      spec = transforms.MelSpectrogram(sr, n_fft, ws, hop_len, f_min, f_max, pad, n_mels)(sig)
    elif (spectro_type == 'mfcc'):
      # Not implemented; raising keeps `spec` from being unbound below.
      raise NotImplementedError("MFCC spectrograms are not supported yet")
    else:
      spec = transforms.Spectrogram(n_fft, ws, hop_len, pad, normalize=False)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return spec
Example 20
    def __call__(self, data):
        signal = data["signal"]
        sr = data['sample_rate']

        self.n_fft = int(np.ceil(0.025 * sr))
        self.win_length = int(np.ceil(0.025 * sr))
        self.hop_length = int(np.ceil(0.01 * sr))

        spec = nn.Sequential(
            T.Spectrogram(n_fft=self.n_fft,
                          win_length=self.win_length,
                          hop_length=self.hop_length), T.AmplitudeToDB())

        # Run the transform pipeline once and store the result under both keys.
        spectrogram = spec(signal)
        data['Spectrogram'] = spectrogram
        data['input'] = spectrogram

        return data
Example 21
def extract_features(x, sr):
    step = 0.01
    fft_time = 0.05
    n_mels = 128
    n_mfcc = 40
    n_fft = int(fft_time * sr)
    hop_length = int(step * sr)

    spec = AT.MelSpectrogram(sample_rate=sr,
                             n_fft=n_fft,
                             hop_length=hop_length,
                             n_mels=n_mels,
                             f_max=8000)(x)[0]
    intensity = spec.mean(dim=0).log()
    spec = AT.AmplitudeToDB()(spec)
    mfcc = AT.MFCC(
        sample_rate=sr,
        n_mfcc=n_mfcc,
        melkwargs={
            "n_fft": n_fft,
            "hop_length": hop_length,
            "n_mels": n_mels,
            "f_max": 8000,
        },
    )(x)[0]
    mfcc = (mfcc - mfcc.mean(dim=1, keepdim=True)) / mfcc.std(dim=1,
                                                              keepdim=True)
    pitch_feature = AF.compute_kaldi_pitch(
        x,
        sample_rate=sr,
        frame_length=fft_time * 1000,
        frame_shift=step * 1000,
        snip_edges=True,
        min_f0=70,
        max_f0=350,
        penalty_factor=0.01,
    )
    pitch = pitch_feature[0]
    return {
        "Waveform": x[0],
        "MelSpectrogram": spec,
        "MFCC": mfcc,
        "Pitch": pitch,
        "Intensity": intensity,
    }
Example 22
def get_time_frequency_transform(config):
    """
    Returns a nn.Sequential block to do a time-frequency transform, and crop to the desired size.
    The spectrogram has shape: [batch, channels, freq_bins, frames]

    :param config:
    :return:
    """
    if config.use_mels:
        transformer = nn.Sequential(
            tforms_torch.MelSpectrogram(sample_rate=config.new_fs,
                                        n_fft=config.n_fft,
                                        win_length=config.win_length,
                                        hop_length=config.hop_length,
                                        f_min=float(config.fmin),
                                        f_max=float(config.fmax),
                                        pad=0,
                                        n_mels=config.n_mels),
            #utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.RandomCrop(
                (1, config.n_mels if config.use_mels else config.n_fft // 2 +
                 1, config.max_length_frames),
                value=0),
            tforms_torch.AmplitudeToDB(stype='power', top_db=80),
            tforms_mine.ReScaleSpec([-1, 1]),
        )
    else:
        transformer = nn.Sequential(
            tforms_torch.Spectrogram(n_fft=config.n_fft,
                                     win_length=config.win_length,
                                     hop_length=config.hop_length,
                                     pad=0,
                                     power=2,
                                     normalized=True),
            tforms_mine.RandomCrop(
                (1, config.n_mels if config.use_mels else config.n_fft // 2 +
                 1, config.max_length_frames),
                value=0),
            tforms_mine.AmplitudeToDB(stype='power', top_db=80),
            #utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.ReScaleSpec([-1, 1]),
        )

    return transformer
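
The docstring's shape claim can be verified with a self-contained reduction of the same pattern that keeps only the torchaudio pieces (the project-specific `tforms_mine` crop and rescale are omitted; the parameters below are placeholders):

import torch
import torch.nn as nn
import torchaudio.transforms as tforms_torch

transformer = nn.Sequential(
    tforms_torch.MelSpectrogram(sample_rate=16000, n_fft=1024, hop_length=512, n_mels=64),
    tforms_torch.AmplitudeToDB(stype='power', top_db=80),
)
wave = torch.randn(8, 1, 16000)  # [batch, channels, samples]
print(transformer(wave).shape)   # [batch, channels, freq_bins, frames]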
Example 23
 def __getitem__(self, index):
     audio, sr = load(self.file_paths[index])
     audio = torch.mean(audio, dim=0, keepdim=True)
     if self.sr != sr:
         audio = transforms.Resample(sr, self.sr)(audio)
     mel_spectrogram = transforms.MelSpectrogram(sample_rate=self.sr,
                                                 n_fft=self.n_fft,
                                                 win_length=self.win_length,
                                                 hop_length=self.hop_length,
                                                 n_mels=self.n_mels,
                                                 f_max=self.sr / 2)(audio)
     if self.log_mel:
         offset = 1e-6
         mel_spectrogram = torch.log(mel_spectrogram + offset)
     else:
         mel_spectrogram = transforms.AmplitudeToDB(
             stype="power", top_db=80)(mel_spectrogram)
     if self.augment:
         # The masking transforms operate on the spectrogram, not the raw
         # waveform, and their output must be kept:
         mel_spectrogram = transforms.FrequencyMasking(
             freq_mask_param=20)(mel_spectrogram)
         mel_spectrogram = transforms.TimeMasking(
             time_mask_param=10)(mel_spectrogram)
     label = self.labels[index]
     return mel_spectrogram, label
Example 24
def Mel_spectroize(train_feature_path, train_label_path):
    x_data = sorted(glob(train_feature_path))
    x_data = data_loader(x_data)
    y_data = pd.read_csv(train_label_path, index_col=0)
    y_data = y_data.values

    mel_spectrogram = nn.Sequential(
        AT.MelSpectrogram(sample_rate=16000,
                          n_fft=512,
                          win_length=400,
                          hop_length=160,
                          n_mels=80),
        AT.AmplitudeToDB()
    )

    # Collect the spectrograms in a list and concatenate once at the end;
    # calling torch.cat inside the loop re-copies the growing tensor on every step.
    mels = [mel_spectrogram(torch.tensor(x_data[0])).view(1, 1, 80, 101),
            mel_spectrogram(torch.tensor(x_data[1])).view(1, 1, 80, 101)]
    for i in range(2, 100000):
        if i % 100 == 0:
            print("Mel spectrogram progress: {}%".format(i / 100000 * 100))
        mels.append(mel_spectrogram(torch.tensor(x_data[i])).view(1, 1, 80, 101))
    mel = torch.cat(mels, 0)
    return mel, y_data
Example 25
def get_train_transforms(config: object,
                         transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                                      n_fft=config.n_fft,
                                      win_length=config.hop_length,
                                      hop_length=config.hop_length,
                                      f_min=float(config.fmin),
                                      f_max=float(config.fmax),
                                      pad=0,
                                      n_mels=config.n_mels),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                      padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif transforms_set == TformsSet.Audtorch:  # no real mel spectrogram in audtorch
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                tforms2.Log(),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif transforms_set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    else:
        if transforms_set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.Spectrogram(n_fft=config.n_fft,
                                   win_length=config.hop_length,
                                   hop_length=config.hop_length,
                                   pad=0,
                                   power=2,
                                   normalized=True),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                #                       n_fft=config.n_fft,
                #                       win_length=config.hop_length,
                #                       hop_length=config.hop_length,
                #                       f_min=float(config.fmin),
                #                       f_max=float(config.fmax),
                #                       pad=0,
                #                       n_mels=config.n_mels),

                #transforms.ToPILImage(),
                #transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                      padding_mode='reflect'),
                #transforms.ToTensor(),
            ])
        elif transforms_set == TformsSet.Audtorch:
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                #tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif transforms_set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    return trans
Example 26
 def get_spectrogram(amplitude: Tensor, top_db: float = 80) -> Tensor:
     device = amplitude.device
     amplitude_to_db = _transf.AmplitudeToDB(top_db=top_db).to(device)
     return amplitude_to_db(amplitude.pow(2))
Example 27
 def test_AmplitudeToDB(self):
     spec = torch.rand((6, 201))
     self._assert_consistency(T.AmplitudeToDB(), spec)
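
For context, the `_assert_consistency` helper in torchaudio's test suite scripts the transform with TorchScript and compares its output against eager mode. A minimal standalone equivalent of that check (a sketch, not the suite's actual helper):

import torch
import torchaudio.transforms as T

transform = T.AmplitudeToDB()
scripted = torch.jit.script(transform)  # AmplitudeToDB is TorchScript-compatible
spec = torch.rand(6, 201)
assert torch.allclose(transform(spec), scripted(spec))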
Example 28
                    default=None,
                    help='Optimizer path')
parser.add_argument('--x',
                    type=int,
                    default=1,
                    help='Number of AE layers to use')
parser.add_argument('--batch_size', type=int, default=64, help='Batch size')
parser.add_argument('--learn_rate',
                    type=float,
                    default=0.001,
                    help='Learning rate')

args = parser.parse_args()

transform = tvt.Compose([
    transforms.AmplitudeToDB(stype='power',
                             top_db=None),  # stype='power' assumes the input is already power-scaled
    tvt.Normalize(mean=[-38.39992], std=[13.462255])
])


class MyIterableDataset(torch.utils.data.Dataset):  # a map-style Dataset, despite the name
    def __init__(self, audios, transform):
        super(MyIterableDataset, self).__init__()
        self.all_audios = audios  # [os.path.join(path, f) for f in os.listdir(path)]
        self.start = 0
        self.end = len(self.all_audios)
        self.transform = transform

    def __getitem__(self, index):
        output, _ = torchaudio.load(self.all_audios[index], normalization=True)
Example 29
                                             drop_last=True,
                                             **params)

    test_path = r'X:\DS Training Data\samples\test.npy'
    test_data = Dataset(test_path)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              drop_last=True,
                                              **params)

    spectrogram = T.Spectrogram(
        n_fft=2048,
        win_length=None,
        hop_length=512,
        power=2,
    )
    amp_to_db = T.AmplitudeToDB(stype='power')

    src_path = r'X:\DS Training Data\samples\src.npy'

    model = VAE(d=32,
                src_path=src_path,
                batch_size=params['batch_size'],
                device=device,
                dropout_rate=0.25,
                encoder='CNN').cuda()
    print(model)

    criterion = loss_function

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
Example 30
def compareTforms(config):
    '''
    Here I compare different transformation sets for spectrograms (torchaudio, audtorch, and my own
    custom spectrogram using librosa). The code is applied to a sample audio file from the LibriSpeech
    dataset.

    This was written mostly to post as a minimal working example in a GitHub issue.
    '''
    config.use_mels = False
    config.win_length = 400
    config.hop_length = 400
    config.n_fft = 2048
    config.resampling_rate = 16000
    augment1 = tforms2.Compose([
        myTforms.ToTensor(),
        tforms.Spectrogram(
            n_fft=2048,
            win_length=400,  # 400 samples @ 16k = 25 ms,
            hop_length=400,
            pad=0,
            power=2,
            normalized=False),
        tforms.AmplitudeToDB(stype='power', top_db=80)
    ])

    augment2 = tforms2.Compose([
        tforms2.Spectrogram(
            window_size=400,  # 400 samples @ 16k = 25 ms
            hop_size=400,
            fft_size=2048),
        myTforms.ToTensor(),
        tforms.AmplitudeToDB(stype='magnitude', top_db=80)
    ])

    augment3 = tforms2.Compose([myTforms.Spectrogram(config)])

    data1 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment1)
    data2 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment2)
    data3 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment3)

    plt.figure(figsize=(16, 8))

    titles = ['torchaudio', 'audtorch', 'myset']
    for i, data in enumerate([data1, data2, data3]):
        spec, label = data[0]

        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()

        plt.subplot(1, 3, i + 1)
        plt.imshow(spec.squeeze(),
                   interpolation='nearest',
                   cmap='inferno',
                   origin='lower',
                   aspect='auto')
        plt.colorbar()
        plt.title(titles[i])

    plt.savefig(os.path.join('./results', 'Test_Output_compare_specs.png'))
    plt.show()