def __init__(self, n_fft=2048, hop_length=1024, n_mels=128, sample_rate=16000,
             power=1, f_min=40, f_max=7600, pad_end=True, center=False):
    """Mel-spectrogram front end: STFT parameters plus a MelScale projection."""
    super().__init__()
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.power = power
    self.f_min = f_min
    self.f_max = f_max
    self.sample_rate = sample_rate
    self.n_mels = n_mels
    self.pad_end = pad_end
    self.center = center
    # n_stft = n_fft // 2 + 1 frequency bins from a one-sided STFT
    self.mel_scale = MelScale(self.n_mels, self.sample_rate, self.f_min,
                              self.f_max, self.n_fft // 2 + 1)
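# A minimal sketch of the forward pass this constructor implies: waveform ->
# magnitude spectrogram -> mel. The standalone usage below is an assumption;
# only the MelScale arguments come from the __init__ above.
import torch
from torchaudio.transforms import MelScale, Spectrogram

spectrogram = Spectrogram(n_fft=2048, hop_length=1024, power=1, center=False)
mel_scale = MelScale(128, 16000, 40, 7600, 2048 // 2 + 1)

waveform = torch.randn(1, 16000)        # 1 s of dummy audio at 16 kHz
mel = mel_scale(spectrogram(waveform))  # -> (1, 128, n_frames)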
def __init__(self, sample_rate=16000, win_ms=25, hop_ms=10, n_freq=201,
             n_mels=40, n_mfcc=13, feat_list=None, eps=1e-10, **kwargs):
    super(OnlinePreprocessor, self).__init__()
    # save preprocessing arguments
    self._sample_rate = sample_rate
    self._win_ms = win_ms
    self._hop_ms = hop_ms
    self._n_freq = n_freq
    self._n_mels = n_mels
    self._n_mfcc = n_mfcc

    # convert window/hop lengths from milliseconds to samples
    win = round(win_ms * sample_rate / 1000)
    hop = round(hop_ms * sample_rate / 1000)
    n_fft = (n_freq - 1) * 2
    self._win_args = {'n_fft': n_fft, 'hop_length': hop, 'win_length': win}
    self.register_buffer('_window', torch.hann_window(win))

    # stft_args: same default values as torchaudio.transforms.Spectrogram
    # & librosa.core.spectrum._spectrogram
    self._stft_args = {'center': True, 'pad_mode': 'reflect',
                       'normalized': False, 'onesided': True}
    self._stft = partial(torch.stft, **self._win_args, **self._stft_args)
    self._magphase = partial(torchaudio.functional.magphase, power=2)
    self._melscale = MelScale(sample_rate=sample_rate, n_mels=n_mels)
    self._mfcc_trans = MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc,
                            log_mels=True, melkwargs=self._win_args)
    # torch.istft does not accept pad_mode, so drop it before forwarding
    istft_args = {k: v for k, v in self._stft_args.items() if k != 'pad_mode'}
    self._istft = partial(torch.istft, **self._win_args, **istft_args)

    self.feat_list = feat_list
    self.register_buffer('_pseudo_wavs',
                         torch.randn(N_SAMPLED_PSEUDO_WAV, sample_rate))
    self.eps = eps
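# Rough sketch of how the stored partials compose; the class's forward() is
# not shown, so this usage (and the `preprocessor` instance name) is an
# assumption. return_complex=True targets recent torch.stft versions.
import torch

preprocessor = OnlinePreprocessor()
wav = torch.randn(1, 16000)                                  # (B, samples)
spec = preprocessor._stft(wav, window=preprocessor._window,
                          return_complex=True)               # (B, n_freq, T)
power = spec.abs().pow(2)                                    # power spectrogram
mel = preprocessor._melscale(power)                          # (B, n_mels, T)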
def forward(self, batch):
    x = batch['audio']
    # complex STFT (power=None) so the stretch below can operate on phase too;
    # note: building the transform on every call is wasteful (see sketch below)
    x = torchaudio.transforms.Spectrogram(power=None, normalized=False).cuda()(x)
    if self.training and self.random_time_stretch:
        x, _ = self.random_stretch(x)
    # power is usually applied inside Spectrogram (power=2); it is applied
    # separately here because time-stretching needs the complex STFT
    x = self.complex_norm(x)
    if self.mel_scale:
        x = MelScale().cuda()(x)
    if self.normalize_spectrogram:
        x = self.norm(x)
    x = x.unsqueeze(1).float()                  # add channel dim: (B, 1, F, T)
    x = self.convs(x)
    x = x.view(x.shape[0], x.shape[1], -1)      # flatten spatial dims
    # global average pooling over the flattened feature axis
    x = F.avg_pool1d(x, kernel_size=x.size()[2:]).squeeze(2)
    x = self.dense(x)
    return F.log_softmax(x, dim=1)
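# Building Spectrogram/MelScale once and registering them as submodules avoids
# the per-batch construction above. A minimal sketch, assuming the same
# defaults as forward(); `CachedFrontEnd` is a hypothetical name.
import torch
import torchaudio

class CachedFrontEnd(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # created once; .cuda()/.to(device) on the model moves them as well
        self.spectrogram = torchaudio.transforms.Spectrogram(power=None,
                                                             normalized=False)
        self.to_mel = torchaudio.transforms.MelScale()

    def forward(self, audio):
        spec = self.spectrogram(audio)       # complex STFT (recent torchaudio)
        return self.to_mel(spec.abs() ** 2)  # power mel spectrogram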
ref_level_db = 20
shape = 24      # length of the time axis of split spectrograms to feed to the generator
vec_len = 128   # length of the vector produced by the siamese network
bs = 128        # batch size
delta = 2.      # constant for the siamese loss
tag = 'HAP'     # the tag for the training

"""# helper functions"""

torch.set_default_tensor_type('torch.cuda.FloatTensor')

# MEL-SPECTROGRAM
print("finally start...")
specobj = Spectrogram(n_fft=6*hop, win_length=6*hop, hop_length=hop, pad=0,
                      power=2, normalized=True)
specfunc = specobj.forward
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0.)
melfunc = melobj.forward

def melspecfunc(waveform):
    specgram = specfunc(waveform)
    mel_specgram = melfunc(specgram)
    return mel_specgram

def spectral_convergence(input, target):
    # log-spectral distance in dB between input and target spectrograms
    return 20 * ((input - target).norm().log10() - target.norm().log10())

def GRAD(spec, transform_fn, samples=None, init_x0=None, maxiter=1000,
         tol=1e-6, verbose=1, evaiter=10, lr=0.003):
    spec = torch.Tensor(spec)
    samples = (spec.shape[-1] * hop) - hop
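# For context: a GRAD-style routine typically runs gradient descent on a
# waveform until transform_fn(x) matches the target spectrogram. A minimal
# sketch under that assumption (optimizer choice and loop body are guesses;
# only the sizing expression is taken from GRAD above):
def invert_sketch(spec, transform_fn, maxiter=1000, lr=0.003):
    spec = torch.Tensor(spec)
    samples = (spec.shape[-1] * hop) - hop       # same sizing as GRAD above
    x = torch.randn(samples, requires_grad=True)
    optimizer = torch.optim.Adam([x], lr=lr)
    criterion = torch.nn.MSELoss()
    for _ in range(maxiter):
        optimizer.zero_grad()
        loss = criterion(transform_fn(x), spec)
        loss.backward()
        optimizer.step()
    return x.detach()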
    return report


if __name__ == '__main__':
    # Example Usage
    # loading modulation kernels
    gabor_strf_parameters = torch.load(
        'modulation_kernel_parameters/gabor_strf_parameters.pt',
        map_location=lambda storage, loc: storage)['state_dict']
    gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
    gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

    stft2mel = MelScale(n_mels=80, sample_rate=16000, n_stft=257)

    # initializing the modulation loss
    modulation_loss_module = ModulationDomainLossModule(
        gabor_modulation_kernels.eval())

    # (B, F, T) - pytorch convention
    # predicted enhanced and ground-truth clean STFTM
    enhanced_speech_STFTM = torch.abs(torch.rand(5, 257, 100))
    clean_speech_STFTM = torch.abs(torch.rand(5, 257, 100))

    # Convert to log-mel representation
    # (B, T, #mel_channels)
    clean_log_mel = torch.log(
        torch.transpose(stft2mel(clean_speech_STFTM**2), 2, 1) + 1e-8)
    enhanced_log_mel = torch.log(
        torch.transpose(stft2mel(enhanced_speech_STFTM**2), 2, 1) + 1e-8)
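    # The two log-mel tensors presumably feed the modulation-domain loss next.
    # A hedged guess at the final call; ModulationDomainLossModule's exact
    # signature is not shown in this snippet.
    modulation_loss = modulation_loss_module(enhanced_log_mel, clean_log_mel)
    print('modulation loss:', modulation_loss.item())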
def __init__(self, sr: int, sg_cfg: SpectrogramConfig):
    self.sg_cfg = sg_cfg
    self.spec = Spectrogram(**sg_cfg.spec_args)
    self.to_mel = MelScale(sample_rate=sr, **sg_cfg.mel_args)
    self.mfcc = MFCC(sample_rate=sr, **sg_cfg.mfcc_args)
    self.to_db = AmplitudeToDB(top_db=sg_cfg.top_db)
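# A minimal sketch of how these transforms would chain; the instance name
# `fe`, the class name `AudioFeatureExtractor`, the config `my_cfg`, and the
# 1-second dummy input are assumptions, not from the source.
import torch

fe = AudioFeatureExtractor(sr=16000, sg_cfg=my_cfg)  # hypothetical names
waveform = torch.randn(1, 16000)                     # 1 s of dummy audio
log_mel = fe.to_db(fe.to_mel(fe.spec(waveform)))     # dB-scaled mel spectrogram
mfcc = fe.mfcc(waveform)                             # MFCCs from the raw waveform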
griffin_lim = GriffinLim(n_fft=1024, hop_length=256).to(device)
writer = tensorboard.SummaryWriter(log_dir='logs/test')

dataset = Dataset('../DATASETS/LJSpeech-1.1/metadata.csv',
                  '../DATASETS/LJSpeech-1.1')
dataloader = DataLoader(dataset, collate_fn=dataset.collocate,
                        batch_size=batch_size, shuffle=False,
                        num_workers=0, drop_last=True)

resample = Resample(orig_freq=22050, new_freq=sample_rate)
spectrogram = Spectrogram(n_fft=1024, hop_length=256).to(device)
to_mel = MelScale(n_mels=80, sample_rate=sample_rate,
                  n_stft=1024 // 2 + 1).to(device)

with open('../DATASETS/LJSpeech-1.1/metadata.csv', encoding='utf8') as file:
    data = [line.strip().split('|') for line in file]

path, text = data[0][0], data[0][1]
path = f'../DATASETS/LJSpeech-1.1/wavs/{path}.wav'
data, sr = torchaudio.load(path)
data = resample(data)
data = data.to(device)
data = spectrogram(data.squeeze(0))

# normalize into [0, 1] for TensorBoard image logging
mel_norm = ((data.unsqueeze(0) - data.mean()) / data.std()).clamp(-1, 1) * .5 + .5
writer.add_image('spec/origin', mel_norm, 0)
writer.add_audio('audio/origin', griffin_lim(data), 0, sample_rate=sample_rate)
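# GriffinLim above inverts the linear-frequency spectrogram directly. To get
# audio back from the mel representation produced by to_mel, an approximate
# mel -> linear inversion is needed first. A sketch using torchaudio's
# InverseMelScale; this step is an assumption and not part of the snippet.
from torchaudio.transforms import InverseMelScale

inv_mel = InverseMelScale(n_stft=1024 // 2 + 1, n_mels=80,
                          sample_rate=sample_rate).to(device)
mel = to_mel(data)                          # (n_mels, T) mel spectrogram
linear_approx = inv_mel(mel)                # least-squares linear estimate
writer.add_audio('audio/from_mel', griffin_lim(linear_approx), 0,
                 sample_rate=sample_rate)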