def test_AmplitudeToDB(self):
    waveform, sample_rate = torchaudio.load(self.test_filepath)

    mag_to_db_transform = transforms.AmplitudeToDB('magnitude', 80.)
    power_to_db_transform = transforms.AmplitudeToDB('power', 80.)

    mag_to_db_torch = mag_to_db_transform(torch.abs(waveform))
    power_to_db_torch = power_to_db_transform(torch.pow(waveform, 2))

    self.assertTrue(torch.allclose(mag_to_db_torch, power_to_db_torch))
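A minimal standalone sketch of the identity this test relies on: for any signal x, 20·log10(|x|) equals 10·log10(x²), so 'magnitude' mode on the absolute value should match 'power' mode on the square. The random input is an assumption purely for illustration.

import torch
import torchaudio.transforms as transforms

x = torch.rand(1, 100) + 0.01  # keep values safely above the internal amin clamp
mag_db = transforms.AmplitudeToDB('magnitude', 80.)(x.abs())
pow_db = transforms.AmplitudeToDB('power', 80.)(x.pow(2))
assert torch.allclose(mag_db, pow_db, atol=1e-4)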
def test_batch_AmplitudeToDB(self):
    spec = torch.rand((6, 201))

    # Single then transform then batch
    expected = transforms.AmplitudeToDB()(spec).repeat(3, 1, 1)

    # Batch then transform
    computed = transforms.AmplitudeToDB()(spec.repeat(3, 1, 1))

    self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
    self.assertTrue(torch.allclose(computed, expected))
def test_AmplitudeToDB(self):
    filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
    waveform = common_utils.load_wav(filepath)[0]

    mag_to_db_transform = transforms.AmplitudeToDB('magnitude', 80.)
    power_to_db_transform = transforms.AmplitudeToDB('power', 80.)

    mag_to_db_torch = mag_to_db_transform(torch.abs(waveform))
    power_to_db_torch = power_to_db_transform(torch.pow(waveform, 2))

    self.assertEqual(mag_to_db_torch, power_to_db_torch)
def get_train_transforms(config: object,
                         transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = tforms_vision.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                            n_fft=config.n_fft,
                                            win_length=config.hop_length,
                                            hop_length=config.hop_length,
                                            f_min=float(config.fmin),
                                            f_max=float(config.fmax),
                                            pad=0,
                                            n_mels=config.n_mels),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                # tforms_aud.RandomCrop(config.max_length_frames),  # Raises "Can't call
                # numpy() on Variable that requires grad. Use var.detach().numpy() instead."
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    else:
        if transforms_set == TformsSet.TorchAudio:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.Spectrogram(n_fft=config.n_fft,
                                         win_length=config.hop_length,
                                         hop_length=config.hop_length,
                                         pad=0,
                                         power=2,
                                         normalized=True),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    return trans
def tfm_spectro(ad=None, sig=None, sr=16000, to_db_scale=False, n_fft=1024,
                ws=None, hop=None, f_min=0.0, f_max=8000.0, top_db=80.0,
                pad=0, n_mels=128):
    # f_min/f_max bound the mel filterbank (a negative f_max is invalid), and
    # top_db caps the dynamic range in dB; AmplitudeToDB rejects negative values.
    sample_rate = ad.sr if ad is not None else sr
    # We must reshape the signal for torchaudio to generate the spectrogram.
    mel = transforms.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels,
                                    n_fft=n_fft, hop_length=hop, f_min=f_min,
                                    f_max=f_max, pad=pad)(sig.reshape(1, -1))
    mel = mel.permute(0, 2, 1)  # swap dimensions, mostly to look sane to a human
    if to_db_scale:
        # MelSpectrogram returns a power spectrogram (power=2.0 by default),
        # so stype='power' is the matching conversion.
        mel = transforms.AmplitudeToDB(stype='power', top_db=top_db)(mel)
    return mel
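A hedged usage sketch for the signature above, using a one-second synthetic signal; the sample rate and signal are stand-ins, and `ad` is omitted so the explicit `sr` argument applies.

import torch

sig = torch.randn(16000)                      # fake 1 s mono signal at 16 kHz
mel_db = tfm_spectro(sig=sig, sr=16000, to_db_scale=True)
print(mel_db.shape)                           # (1, frames, n_mels) after the permute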
def __init__(self, d, src_path, batch_size, device):
    super(VAE, self).__init__()
    '''
    =========== ARGUMENTS ===========
    > d - dimensionality of latent space
    > src_path - path to source samples
    > batch_size - number of training examples in single batch
    > device - CPU or GPU in use
    =================================
    '''
    self.enc = ResNetBigger(d=d)
    self.dec = nn.Sequential(nn.Linear(d, 40),
                             nn.ReLU(),
                             nn.Linear(40, 50),
                             nn.Sigmoid())

    # For computing spectrogram then normalizing
    self.spectrogram = T.Spectrogram(
        n_fft=2048,
        win_length=None,
        hop_length=512,
        power=2,
    )
    self.amp_to_db = T.AmplitudeToDB(stype='power')

    self.src = torch.from_numpy(np.load(src_path))
    self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
    self.src = self.src.to(device)

    self.d = d
    self.device = device
def make_mel_spectrogram(self, sig_t, framerate):
    # Get tensor (128 x num_samples), where 128 is the default number
    # of mel bands; this can be changed in the call to MelSpectrogram:
    mel_spec_t = transforms.MelSpectrogram(
        sample_rate=framerate,
        n_fft=self.n_fft,
        win_length=self.win_length,
        hop_length=self.hop_length
    )(sig_t)

    # Turn energy values into dB relative to the max energy
    # in the spectrogram:
    mel_spec_db_t = transforms.AmplitudeToDB()(mel_spec_t)

    (num_mel_bands, _num_timebins) = mel_spec_t.shape

    # Number of columns in the spectrogram:
    num_time_label_choices = DSPUtils.compute_timeticks(framerate, mel_spec_db_t)

    # Enumeration of the mel bands to use as y-axis labels:
    freq_labels = np.array(range(num_mel_bands))

    return (freq_labels, num_time_label_choices, mel_spec_db_t)
def test_amplitude_to_db(self):
    sample_rate = 8000
    transform = T.AmplitudeToDB()
    waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
    self.assert_grad(transform, [waveform])
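`assert_grad` is a helper from torchaudio's test harness; a standalone analogue, sketched under the assumption that double precision is used, relies on torch.autograd.gradcheck directly.

import torch
import torchaudio.transforms as T

transform = T.AmplitudeToDB()
# gradcheck needs float64 inputs; the offset keeps values away from zero,
# where the gradient of log10 would blow up numerically.
waveform = (torch.rand(2, 400, dtype=torch.float64) + 0.1).requires_grad_(True)
assert torch.autograd.gradcheck(transform, (waveform,))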
def test_power_to_db(self):
    spectrogram = get_spectrogram(
        get_whitenoise(), n_fft=400, power=2).to(self.device, self.dtype)
    result = T.AmplitudeToDB('power', 80.).to(self.device, self.dtype)(spectrogram)[0]
    expected = librosa.core.spectrum.power_to_db(spectrogram[0].cpu().numpy())
    self.assertEqual(result, torch.from_numpy(expected))
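A hedged companion check for 'magnitude' mode, assuming librosa is installed; librosa.amplitude_to_db uses amin=1e-5 internally, so the fake spectrogram is offset to stay above that threshold, where the two implementations should agree.

import librosa
import torch
import torchaudio.transforms as T

mag = torch.rand(1, 201, 100) + 0.01          # fake magnitude spectrogram
result = T.AmplitudeToDB('magnitude', 80.)(mag)[0]
expected = librosa.amplitude_to_db(mag[0].numpy(), top_db=80.)
assert torch.allclose(result, torch.from_numpy(expected), atol=1e-4)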
def __init__(self, classes_num):
    super(ResNet54, self).__init__()

    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    self.mel_spectrogram = nn.Sequential(
        AT.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
                          hop_length=160, n_mels=80, f_max=8000),
        AT.AmplitudeToDB())

    melkwargs = {
        "n_fft": 512,
        "hop_length": 160,
        "win_length": 400,
        "n_mels": 80,
        "f_max": 8000
    }
    self.mfcc = AT.MFCC(sample_rate=16000, n_mfcc=40, melkwargs=melkwargs)

    # Spectrogram extractor
    # self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
    #     win_length=window_size, window=window, center=center, pad_mode=pad_mode,
    #     freeze_parameters=True)

    # Logmel feature extractor
    # self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
    #     n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
    #     freeze_parameters=True)

    # Spec augmenter
    # self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
    #     freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(64)

    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    # self.conv_block2 = ConvBlock(in_channels=64, out_channels=64)

    self.resnet = _ResNet(block=_ResnetBottleneck, layers=[3, 4, 6, 3],
                          zero_init_residual=True)

    self.conv_block_after1 = ConvBlock(in_channels=2048, out_channels=2048)

    self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
    # self.fc1 = nn.Linear(2048, 2048)
    self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

    self.init_weights()
def spectrogram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig, sr = aud
    top_db = 80

    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len,
                                     n_mels=n_mels)(sig)
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return spec
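A hedged usage sketch with a synthetic stereo clip; `aud` is the (signal, sample_rate) pair the function expects, and the shape noted is what torchaudio produces with these defaults.

import torch

sig = torch.randn(2, 22050)                   # fake 1 s stereo clip at 22.05 kHz
spec = spectrogram((sig, 22050))
print(spec.shape)                             # (2, 64, frames)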
def test_mel2(self):
    top_db = 80.
    s2db = transforms.AmplitudeToDB('power', top_db)

    waveform = self.waveform.clone()  # (1, 16000)
    waveform_scaled = self.scale(waveform)  # (1, 16000)
    mel_transform = transforms.MelSpectrogram()

    # check defaults
    spectrogram_torch = s2db(mel_transform(waveform_scaled))  # (1, 128, 321)
    self.assertTrue(spectrogram_torch.dim() == 3)
    self.assertTrue(spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
    self.assertEqual(spectrogram_torch.size(1), mel_transform.n_mels)

    # check correctness of filterbank conversion matrix
    self.assertTrue(mel_transform.mel_scale.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform.mel_scale.fb.sum(1).ge(0.).all())

    # check options
    kwargs = {
        'window_fn': torch.hamming_window,
        'pad': 10,
        'win_length': 500,
        'hop_length': 125,
        'n_fft': 800,
        'n_mels': 50
    }
    mel_transform2 = transforms.MelSpectrogram(**kwargs)
    spectrogram2_torch = s2db(mel_transform2(waveform_scaled))  # (1, 50, 513)
    self.assertTrue(spectrogram2_torch.dim() == 3)
    self.assertTrue(spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
    self.assertEqual(spectrogram2_torch.size(1), mel_transform2.n_mels)
    self.assertTrue(mel_transform2.mel_scale.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform2.mel_scale.fb.sum(1).ge(0.).all())

    # check on multi-channel audio
    filepath = common_utils.get_asset_path('steam-train-whistle-daniel_simon.wav')
    x_stereo = common_utils.load_wav(filepath)[0]  # (2, 278756), 44100
    spectrogram_stereo = s2db(mel_transform(x_stereo))  # (2, 128, 1394)
    self.assertTrue(spectrogram_stereo.dim() == 3)
    self.assertTrue(spectrogram_stereo.size(0) == 2)
    self.assertTrue(spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
    self.assertEqual(spectrogram_stereo.size(1), mel_transform.n_mels)

    # check filterbank matrix creation
    fb_matrix_transform = transforms.MelScale(
        n_mels=100, sample_rate=16000, f_min=0., f_max=None, n_stft=400)
    self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
    self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
    self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def get_mel_spectogram(file_path, n_fft, win_length, hop_length, n_mels) -> torch.Tensor:
    x, sr = torchaudio.load(file_path, normalization=lambda x: torch.abs(x).max())

    mel_spectrogram = nn.Sequential(
        AT.MelSpectrogram(sample_rate=sr, n_fft=n_fft, win_length=win_length,
                          hop_length=hop_length, n_mels=n_mels),
        AT.AmplitudeToDB())

    return mel_spectrogram(x)
def get_lmfs(name: str, dsp: DSP) -> torch.Tensor:
    """Return the log mel frequency spectrogram."""
    map_path = f'{RAW_PATH}/{name}'
    mono_sig, fs = load(glob(f'{map_path}/*.mp3')[0], sr=dsp.fs,
                        res_type='kaiser_fast')
    mono_sig = torch.from_numpy(mono_sig)
    norm_sig = normalize(mono_sig)
    mfs = transforms.MelSpectrogram(sample_rate=fs, n_fft=dsp.W,
                                    f_min=dsp.f_min, f_max=dsp.f_max,
                                    n_mels=dsp.bands, hop_length=dsp.stride,
                                    window_fn=torch.hamming_window)(norm_sig)
    lmfs = transforms.AmplitudeToDB()(mfs).unsqueeze(0).half().detach()
    return lmfs
def __init__(self, wav_paths, script_paths, bos_id=1307, eos_id=1308, is_train=True):
    self.wav_paths = wav_paths
    self.script_paths = script_paths
    self.bos_id, self.eos_id = bos_id, eos_id
    self.is_train = is_train
    self.melspec = transforms.MelSpectrogram(sample_rate=SAMPLE_RATE,
                                             n_fft=N_FFT, n_mels=128)
    self.todb = transforms.AmplitudeToDB(stype="magnitude", top_db=80)
def spectrogram_to_db(cls, spect_magnitude):
    '''
    Takes a numpy spectrogram of magnitudes and returns a numpy
    spectrogram containing dB-scaled power.

    @param spect_magnitude: spectrogram of magnitude values
    @type spect_magnitude: np.ndarray
    '''
    transformer = transforms.AmplitudeToDB('power')
    spect_tensor = torch.Tensor(spect_magnitude)
    spect_dB_tensor = transformer.forward(spect_tensor)
    spect_dB = spect_dB_tensor.numpy()
    return spect_dB
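A hedged usage sketch, assuming the method is exposed as a classmethod on an enclosing class called AudioUtils here purely for illustration.

import numpy as np

# A fake magnitude spectrogram: 257 frequency bins x 100 frames.
mag = np.abs(np.random.randn(257, 100)).astype(np.float32)
spect_db = AudioUtils.spectrogram_to_db(mag)   # hypothetical class name
print(spect_db.shape, spect_db.dtype)          # (257, 100) float32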
def __init__(self, d, src_path, batch_size, device, dropout_rate=0.5, encoder='ResNet'):
    super(VAE, self).__init__()
    '''
    =========== ARGUMENTS ===========
    > d - dimensionality of latent space
    > src_path - path to source samples
    > batch_size - number of training examples in single batch
    > device - CPU or GPU in use
    =================================
    '''
    if encoder == 'ResNet':
        self.enc = ResNetBigger(d=d, dropout_rate=dropout_rate)
    elif encoder == 'CNN':
        self.enc = CNN(d=d, batch_size=batch_size, dropout_rate=dropout_rate)
    elif encoder == 'linear':
        self.enc = nn.Sequential(
            nn.Linear(1025, 400),
            nn.ReLU(),
            nn.Linear(400, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, d)
        )

    self.dec = nn.Sequential(
        nn.Linear(d, 40),
        nn.ReLU(),
        nn.Linear(40, 50),
        nn.Softmax()
    )
    self.softmax = nn.Softmax(dim=0)

    # For computing spectrogram then normalizing
    self.spectrogram = T.Spectrogram(
        n_fft=2048,
        win_length=None,
        hop_length=512,
        power=2,
    )
    self.amp_to_db = T.AmplitudeToDB(stype='power')

    self.src = torch.from_numpy(np.load(src_path))
    self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
    self.src = self.src.to(device)

    self.d = d
    self.global_step = 0
    self.epoch = 0
def tfm_spectro(sig, sr=32000, to_db_scale=True, n_fft=2048, ws=None, hop=512,
                f_min=20.0, f_max=16000, pad=0, n_mels=128):
    """
    img_size: 224
    melspectrogram_parameters:
        n_mels: 128
        fmin: 20
        fmax: 16000
    """
    # We must reshape the signal for torchaudio to generate the spectrogram.
    mel = transforms.MelSpectrogram(sample_rate=sr, n_mels=n_mels, n_fft=n_fft,
                                    hop_length=hop, f_min=f_min, f_max=f_max,
                                    pad=pad)(sig.reshape(1, -1))
    mel = mel.permute(0, 2, 1)  # swap dimensions, mostly to look sane to a human
    if to_db_scale:
        # Reusing f_max as top_db (16000 dB) effectively disables dynamic-range
        # clipping; a conventional value would be top_db=80.
        mel = transforms.AmplitudeToDB(stype='magnitude', top_db=f_max)(mel)
    return mel
def spectro_gram(aud, spectro_type='mel', n_mels=64, n_fft=1024, hop_len=None):
    sig, sr = aud
    f_min, f_max, ws, top_db, pad = 0.0, None, None, 80, 0

    # spec has shape [channel, n_mels, time], where channel is mono, stereo, etc.
    if spectro_type == 'mel':
        spec = transforms.MelSpectrogram(sr, n_fft, ws, hop_len, f_min, f_max,
                                         pad, n_mels)(sig)
    elif spectro_type == 'mfcc':
        # Falling through here would leave `spec` undefined and raise NameError.
        raise NotImplementedError("MFCC extraction is not implemented")
    else:
        spec = transforms.Spectrogram(n_fft, ws, hop_len, pad,
                                      normalized=False)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return spec
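A hedged usage sketch; the (signal, sample_rate) tuple is synthetic.

import torch

sig = torch.randn(1, 16000)                   # fake 1 s mono clip at 16 kHz
mel_db = spectro_gram((sig, 16000), spectro_type='mel')
print(mel_db.shape)                           # (1, 64, frames)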
def __call__(self, data):
    signal = data["signal"]
    sr = data['sample_rate']

    # 25 ms windows with a 10 ms hop, in samples.
    self.n_fft = int(np.ceil(0.025 * sr))
    self.win_length = int(np.ceil(0.025 * sr))
    self.hop_length = int(np.ceil(0.01 * sr))

    spec = nn.Sequential(
        T.Spectrogram(n_fft=self.n_fft,
                      win_length=self.win_length,
                      hop_length=self.hop_length),
        T.AmplitudeToDB())

    # Compute once and share the result between both keys.
    spectrogram = spec(signal)
    data['Spectrogram'] = spectrogram
    data['input'] = spectrogram
    return data
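A hedged usage sketch; the transform class is assumed to be instantiable with no arguments and is called ToSpectrogram here purely for illustration.

import torch

data = {"signal": torch.randn(1, 16000), "sample_rate": 16000}
out = ToSpectrogram()(data)                    # hypothetical class name
print(out['Spectrogram'].shape)                # (1, n_fft // 2 + 1, frames)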
def extract_features(x, sr):
    step = 0.01
    fft_time = 0.05
    n_mels = 128
    n_mfcc = 40
    n_fft = int(fft_time * sr)
    hop_length = int(step * sr)

    spec = AT.MelSpectrogram(sample_rate=sr, n_fft=n_fft, hop_length=hop_length,
                             n_mels=n_mels, f_max=8000)(x)[0]
    intensity = spec.mean(dim=0).log()
    spec = AT.AmplitudeToDB()(spec)

    mfcc = AT.MFCC(
        sample_rate=sr,
        n_mfcc=n_mfcc,
        melkwargs={
            "n_fft": n_fft,
            "hop_length": hop_length,
            "n_mels": n_mels,
            "f_max": 8000,
        },
    )(x)[0]
    mfcc = (mfcc - mfcc.mean(dim=1, keepdim=True)) / mfcc.std(dim=1, keepdim=True)

    pitch_feature = AF.compute_kaldi_pitch(
        x,
        sample_rate=sr,
        frame_length=fft_time * 1000,
        frame_shift=step * 1000,
        snip_edges=True,
        min_f0=70,
        max_f0=350,
        penalty_factor=0.01,
    )
    pitch = pitch_feature[0]

    return {
        "Waveform": x[0],
        "MelSpectrogram": spec,
        "MFCC": mfcc,
        "Pitch": pitch,
        "Intensity": intensity,
    }
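A hedged usage sketch; note that compute_kaldi_pitch was removed from recent torchaudio releases, so this assumes a version that still provides it.

import torch

x = torch.randn(1, 16000)                     # fake 1 s mono clip at 16 kHz
feats = extract_features(x, 16000)
print({name: tuple(t.shape) for name, t in feats.items()})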
def get_time_frequency_transform(config):
    """
    Returns an nn.Sequential block that applies a time-frequency transform
    and crops the result to the desired size.

    The spectrogram has shape: [batch, channels, freq_bins, frames]

    :param config:
    :return:
    """
    if config.use_mels:
        transformer = nn.Sequential(
            tforms_torch.MelSpectrogram(sample_rate=config.new_fs,
                                        n_fft=config.n_fft,
                                        win_length=config.win_length,
                                        hop_length=config.hop_length,
                                        f_min=float(config.fmin),
                                        f_max=float(config.fmax),
                                        pad=0,
                                        n_mels=config.n_mels),
            # utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.RandomCrop(
                (1,
                 config.n_mels if config.use_mels else config.n_fft // 2 + 1,
                 config.max_length_frames),
                value=0),
            tforms_torch.AmplitudeToDB(stype='power', top_db=80),
            tforms_mine.ReScaleSpec([-1, 1]),
        )
    else:
        transformer = nn.Sequential(
            tforms_torch.Spectrogram(n_fft=config.n_fft,
                                     win_length=config.win_length,
                                     hop_length=config.hop_length,
                                     pad=0,
                                     power=2,
                                     normalized=True),
            tforms_mine.RandomCrop(
                (1,
                 config.n_mels if config.use_mels else config.n_fft // 2 + 1,
                 config.max_length_frames),
                value=0),
            tforms_mine.AmplitudeToDB(stype='power', top_db=80),
            # utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.ReScaleSpec([-1, 1]),
        )
    return transformer
def __getitem__(self, index):
    audio, sr = load(self.file_paths[index])
    audio = torch.mean(audio, dim=0, keepdim=True)  # downmix to mono

    if self.sr != sr:
        audio = transforms.Resample(sr, self.sr)(audio)

    mel_spectrogram = transforms.MelSpectrogram(sample_rate=self.sr,
                                                n_fft=self.n_fft,
                                                win_length=self.win_length,
                                                hop_length=self.hop_length,
                                                n_mels=self.n_mels,
                                                f_max=self.sr / 2)(audio)

    if self.log_mel:
        offset = 1e-6
        mel_spectrogram = torch.log(mel_spectrogram + offset)
    else:
        mel_spectrogram = transforms.AmplitudeToDB(
            stype="power", top_db=80)(mel_spectrogram)

    if self.augment:
        # SpecAugment-style masking operates on the spectrogram, not the raw
        # waveform, and its output must be kept for the augmentation to count.
        mel_spectrogram = transforms.FrequencyMasking(freq_mask_param=20)(mel_spectrogram)
        mel_spectrogram = transforms.TimeMasking(time_mask_param=10)(mel_spectrogram)

    label = self.labels[index]
    return mel_spectrogram, label
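A minimal standalone sketch of why the masking belongs after the mel transform: FrequencyMasking and TimeMasking zero out random bands of spectrogram bins, so applying them to a raw waveform (and discarding the result) has no effect on the returned features.

import torch
import torchaudio.transforms as transforms

mel = torch.rand(1, 64, 100)                  # (channel, n_mels, frames)
masked = transforms.TimeMasking(time_mask_param=10)(
    transforms.FrequencyMasking(freq_mask_param=20)(mel))
print((masked == 0).any())                    # masked bins are zeroed (width is random)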
def Mel_spectroize(train_feature_path, train_label_path):
    x_data = sorted(glob(train_feature_path))
    x_data = data_loader(x_data)
    y_data = pd.read_csv(train_label_path, index_col=0)
    y_data = y_data.values

    mel_spectrogram = nn.Sequential(
        AT.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
                          hop_length=160, n_mels=80),
        AT.AmplitudeToDB()
    )

    mel0 = mel_spectrogram(torch.tensor(x_data[0])).view(1, 1, 80, 101)
    mel1 = mel_spectrogram(torch.tensor(x_data[1])).view(1, 1, 80, 101)
    mel = torch.cat((mel0, mel1), 0)
    for i in range(2, 100000):
        if i % 100 == 0:
            print("Mel spectrogram progress: {}%".format(i / 100000 * 100))
        mel_temp = mel_spectrogram(torch.tensor(x_data[i])).view(1, 1, 80, 101)
        mel = torch.cat((mel, mel_temp), 0)

    return mel, y_data
def get_train_transforms(config: object, set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                                      n_fft=config.n_fft,
                                      win_length=config.hop_length,
                                      hop_length=config.hop_length,
                                      f_min=float(config.fmin),
                                      f_max=float(config.fmax),
                                      pad=0,
                                      n_mels=config.n_mels),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                       padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:
            # no real mel spectrogram in audtorch
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                tforms2.Log(),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    else:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.Spectrogram(n_fft=config.n_fft,
                                   win_length=config.hop_length,
                                   hop_length=config.hop_length,
                                   pad=0,
                                   power=2,
                                   normalized=True),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                #                       n_fft=config.n_fft,
                #                       win_length=config.hop_length,
                #                       hop_length=config.hop_length,
                #                       f_min=float(config.fmin),
                #                       f_max=float(config.fmax),
                #                       pad=0,
                #                       n_mels=config.n_mels),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                       padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    return trans
def get_spectrogram(amplitude: Tensor, top_db: float = 80) -> Tensor:
    device = amplitude.device
    amplitude_to_db = _transf.AmplitudeToDB(top_db=top_db).to(device)
    # Square the amplitude to get power, matching the default stype='power'.
    return amplitude_to_db(amplitude.pow(2))
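A hedged usage sketch; since AmplitudeToDB defaults to stype='power', squaring the amplitude first keeps the 10·log10 scaling consistent.

import torch

amp = torch.rand(1, 201, 100)                 # fake magnitude spectrogram
db = get_spectrogram(amp)
print(db.shape)                               # same shape, now in dB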
def test_AmplitudeToDB(self):
    spec = torch.rand((6, 201))
    self._assert_consistency(T.AmplitudeToDB(), spec)
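`_assert_consistency` comes from torchaudio's TorchScript test utilities; a standalone analogue of what it asserts, sketched under the assumption that scripting round-trips the transform unchanged:

import torch
import torchaudio.transforms as T

transform = T.AmplitudeToDB()
scripted = torch.jit.script(transform)
spec = torch.rand(6, 201)
assert torch.allclose(transform(spec), scripted(spec))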
                    default=None, help='Optimizer path')
parser.add_argument('--x', type=int, default=1, help='Number of AE layers to use')
parser.add_argument('--batch_size', type=int, default=64, help='Batch size')
parser.add_argument('--learn_rate', type=float, default=0.001, help='Learning rate')
args = parser.parse_args()

transform = tvt.Compose([
    transforms.AmplitudeToDB(stype='power', top_db=None),  # it won't square the input
    tvt.Normalize(mean=[-38.39992], std=[13.462255])
])


class MyIterableDataset(torch.utils.data.Dataset):

    def __init__(self, audios, transform):
        super().__init__()
        self.all_audios = audios  # [os.path.join(path, f) for f in os.listdir(path)]
        self.start = 0
        self.end = len(self.all_audios)
        self.transform = transform

    def __getitem__(self, index):
        output, _ = torchaudio.load(self.all_audios[index], normalization=True)
                                          drop_last=True, **params)

test_path = r'X:\DS Training Data\samples\test.npy'
test_data = Dataset(test_path)
test_loader = torch.utils.data.DataLoader(test_data, drop_last=True, **params)

spectrogram = T.Spectrogram(
    n_fft=2048,
    win_length=None,
    hop_length=512,
    power=2,
)
amp_to_db = T.AmplitudeToDB(stype='power')

src_path = test_path = r'X:\DS Training Data\samples\src.npy'

model = VAE(d=32, src_path=src_path, batch_size=params['batch_size'],
            device=device, dropout_rate=0.25, encoder='CNN').cuda()
print(model)

criterion = loss_function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
def compareTforms(config):
    '''
    Here I compare different transformation sets for spectrograms
    (torchaudio, audtorch, and my own custom spectrogram using librosa),
    applied to a sample audio file from the LibriSpeech dataset.

    This code was written mostly to post as a minimal working example
    for an issue on GitHub.
    '''
    config.use_mels = False
    config.win_length = 400
    config.hop_length = 400
    config.n_fft = 2048
    config.resampling_rate = 16000

    augment1 = tforms2.Compose([
        myTforms.ToTensor(),
        tforms.Spectrogram(
            n_fft=2048,
            win_length=400,  # 400 samples @ 16k = 25 ms
            hop_length=400,
            pad=0,
            power=2,
            normalized=False),
        tforms.AmplitudeToDB(stype='power', top_db=80)
    ])

    augment2 = tforms2.Compose([
        tforms2.Spectrogram(
            window_size=400,  # 400 samples @ 16k = 25 ms
            hop_size=400,
            fft_size=2048),
        myTforms.ToTensor(),
        tforms.AmplitudeToDB(stype='magnitude', top_db=80)
    ])

    augment3 = tforms2.Compose([myTforms.Spectrogram(config)])

    data1 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean', download=False, transform=augment1)
    data2 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean', download=False, transform=augment2)
    data3 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean', download=False, transform=augment3)

    plt.figure(figsize=(16, 8))
    titles = ['torchaudio', 'audtorch', 'myset']

    for i, data in enumerate([data1, data2, data3]):
        spec, label = data[0]
        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()

        plt.subplot(1, 3, i + 1)
        plt.imshow(spec.squeeze(), interpolation='nearest', cmap='inferno',
                   origin='lower', aspect='auto')
        plt.colorbar()
        plt.title(titles[i])

    plt.savefig(os.path.join('./results', 'Test_Output_compare_specs.png'))
    plt.show()