Example #1
    def __init__(self,
                 n_fft=2048,
                 hop_length=1024,
                 n_mels=128,
                 sample_rate=16000,
                 power=1,
                 f_min=40,
                 f_max=7600,
                 pad_end=True,
                 center=False):
        """Store the STFT/mel settings and build the mel filter bank
        (n_stft = n_fft // 2 + 1 one-sided frequency bins)."""
        super().__init__()
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.power = power
        self.f_min = f_min
        self.f_max = f_max
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.pad_end = pad_end
        self.center = center
        self.mel_scale = MelScale(self.n_mels, self.sample_rate, self.f_min,
                                  self.f_max, self.n_fft // 2 + 1)
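A forward pass is not shown in this example. Under the assumption that the STFT comes from torchaudio's Spectrogram transform with the stored settings, it might look like the following sketch (pad_end has no direct torchaudio.transforms.Spectrogram equivalent and is omitted):

    def forward(self, waveform):
        # sketch only: magnitude/power spectrogram with the stored settings,
        # then mapping onto the mel scale via the filter bank built above
        spec = torchaudio.transforms.Spectrogram(
            n_fft=self.n_fft, hop_length=self.hop_length,
            power=self.power, center=self.center)(waveform)
        return self.mel_scale(spec)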
Example #2
    def __init__(self, sample_rate=16000, win_ms=25, hop_ms=10, n_freq=201, n_mels=40, n_mfcc=13, feat_list=None, eps=1e-10, **kwargs):
        super(OnlinePreprocessor, self).__init__()
        # save preprocessing arguments
        self._sample_rate = sample_rate
        self._win_ms = win_ms
        self._hop_ms = hop_ms
        self._n_freq = n_freq
        self._n_mels = n_mels
        self._n_mfcc = n_mfcc

        win = round(win_ms * sample_rate / 1000)
        hop = round(hop_ms * sample_rate / 1000)
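        # one-sided STFT yields n_freq = n_fft // 2 + 1 bins, so n_freq=201 -> n_fft=400
        # (400 samples = 25 ms at 16 kHz, matching win above)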
        n_fft = (n_freq - 1) * 2
        self._win_args = {'n_fft': n_fft, 'hop_length': hop, 'win_length': win}
        self.register_buffer('_window', torch.hann_window(win))
        
        self._stft_args = {'center': True, 'pad_mode': 'reflect', 'normalized': False, 'onesided': True}
        # stft_args: same default values as torchaudio.transforms.Spectrogram & librosa.core.spectrum._spectrogram
        self._stft = partial(torch.stft, **self._win_args, **self._stft_args)
        self._magphase = partial(torchaudio.functional.magphase, power=2)
        self._melscale = MelScale(sample_rate=sample_rate, n_mels=n_mels)
        self._mfcc_trans = MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc, log_mels=True, melkwargs=self._win_args)
        # torch.istft does not accept pad_mode, so drop it before inverting
        istft_args = {k: v for k, v in self._stft_args.items() if k != 'pad_mode'}
        self._istft = partial(torch.istft, **self._win_args, **istft_args)
        
        self.feat_list = feat_list
        self.register_buffer('_pseudo_wavs', torch.randn(N_SAMPLED_PSEUDO_WAV, sample_rate))  # N_SAMPLED_PSEUDO_WAV: constant defined elsewhere in the original module
        self.eps = eps
Example #3
    def forward(self, batch):
        x = batch['audio']
        
        x = torchaudio.transforms.Spectrogram(power=None, normalized=False).cuda()(x)
        
        if self.training and self.random_time_stretch:
            x, _ = self.random_stretch(x)
        
        # power=None above keeps the complex STFT, so take the magnitude here ourselves
        x = self.complex_norm(x)
    
        if self.mel_scale:
            x = MelScale().cuda()(x)
        
        if self.normalize_spectrogram:
            x = self.norm(x)

        x = x.unsqueeze(1).float()
        x = self.convs(x)
        x = x.view(x.shape[0], x.shape[1], -1)
        x = F.avg_pool1d(x, kernel_size=x.size()[2:]).squeeze(2)
        x = self.dense(x)
        return F.log_softmax(x, dim=1)
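Building Spectrogram and MelScale inside forward re-creates the transforms on every batch; a common refactor, sketched here under the assumption that the defaults above are kept, is to construct them once in __init__ and reuse them:

        # sketch only: in __init__, build the transforms once
        self.spectrogram = torchaudio.transforms.Spectrogram(power=None, normalized=False)
        self.to_mel = torchaudio.transforms.MelScale()
        # forward then calls self.spectrogram(x) / self.to_mel(x) instead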
Example #4
ref_level_db=20

shape=24              #length of time axis of split spectrograms to feed to generator
vec_len=128           #length of the vector produced by the siamese network
bs = 128               #batch size
delta = 2.            #constant for siamese loss
tag='HAP'             #the tag for the training
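hop=192               #STFT hop length (assumed value; hop is defined earlier in the original script)
sr=16000              #sample rate in Hz (assumed value; sr is defined earlier in the original script)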

"""#helper functions"""

torch.set_default_tensor_type('torch.cuda.FloatTensor')
#MEL-SPECTRUM
print("finally start...")
specobj = Spectrogram(n_fft=6*hop, win_length=6*hop, hop_length=hop, pad=0, power=2, normalized=True)
specfunc = specobj.forward
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0.)
melfunc = melobj.forward

def melspecfunc(waveform):
  specgram = specfunc(waveform)
  mel_specgram = melfunc(specgram)
  return mel_specgram

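# spectral convergence in dB: 20 * log10(||input - target|| / ||target||);
# more negative means the two spectrograms match more closely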
def spectral_convergence(input, target):
    return 20 * ((input - target).norm().log10() - target.norm().log10())

def GRAD(spec, transform_fn, samples=None, init_x0=None, maxiter=1000, tol=1e-6, verbose=1, evaiter=10, lr=0.003):
    # invert a spectrogram by gradient descent on a waveform estimate
    # (minimal loop; tol/verbose/evaiter would drive progress reporting)
    spec = torch.Tensor(spec)
    if samples is None:
        samples = (spec.shape[-1] * hop) - hop
    x = torch.nn.Parameter(spec.new_empty((1, samples)).normal_(std=1e-6) if init_x0 is None else init_x0)
    optimizer = torch.optim.Adam([x], lr=lr)
    for _ in range(maxiter):
        optimizer.zero_grad()
        torch.nn.functional.l1_loss(transform_fn(x), spec).backward()
        optimizer.step()
    return x.detach().view(-1).cpu()


Example #5
if __name__ == '__main__':

    # Example Usage

    # loading modulation kernels
    gabor_strf_parameters = torch.load(
        'modulation_kernel_parameters/gabor_strf_parameters.pt',
        map_location=lambda storage, loc: storage)['state_dict']
    gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
    gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

    stft2mel = MelScale(n_mels=80, sample_rate=16000, n_stft=257)

    # initializing the modulation loss
    modulation_loss_module = ModulationDomainLossModule(
        gabor_modulation_kernels.eval())

    # (B, F, T) - pytorch convention
    # predicted enhanced and ground-truth clean STFTM
    enhanced_speech_STFTM = torch.abs(torch.rand(5, 257, 100))
    clean_speech_STFTM = torch.abs(torch.rand(5, 257, 100))

    # Convert to log-mel representation
    # (B, T, #mel_channels)
    clean_log_mel = torch.log(
        torch.transpose(stft2mel(clean_speech_STFTM**2), 2, 1) + 1e-8)
    enhanced_log_mel = torch.log(
        torch.transpose(stft2mel(enhanced_speech_STFTM**2), 2, 1) + 1e-8)
Example #6
    def __init__(self, sr: int, sg_cfg: SpectrogramConfig):
        self.sg_cfg = sg_cfg
        self.spec = Spectrogram(**sg_cfg.spec_args)
        self.to_mel = MelScale(sample_rate=sr, **sg_cfg.mel_args)
        self.mfcc = MFCC(sample_rate=sr, **sg_cfg.mfcc_args)
        self.to_db = AmplitudeToDB(top_db=sg_cfg.top_db)
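A hedged sketch of how these transforms typically chain (the helper name and exact pipeline are assumptions, not from the source):

    def waveform_to_db_mel(self, waveform):
        # hypothetical helper: waveform -> spectrogram -> mel scale -> decibels
        return self.to_db(self.to_mel(self.spec(waveform)))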
Example #7
griffin_lim = GriffinLim(n_fft=1024, hop_length=256).to(device)

writer = tensorboard.SummaryWriter(log_dir='logs/test')

dataset = Dataset('../DATASETS/LJSpeech-1.1/metadata.csv',
                  '../DATASETS/LJSpeech-1.1')
dataloader = DataLoader(dataset,
                        collate_fn=dataset.collocate,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=0,
                        drop_last=True)

resample = Resample(orig_freq=22050, new_freq=sample_rate)
spectrogram = Spectrogram(n_fft=1024, hop_length=256).to(device)
to_mel = MelScale(n_mels=80, sample_rate=sample_rate,
                  n_stft=1024 // 2 + 1).to(device)
with open('../DATASETS/LJSpeech-1.1/metadata.csv', encoding='utf8') as file:
    data = [line.strip().split('|') for line in file]
path, text = data[0][0], data[0][1]
path = f'../DATASETS/LJSpeech-1.1/wavs/{path}.wav'
data, sr = torchaudio.load(path)

data = resample(data)
data = data.to(device)

data = spectrogram(data.squeeze(0))
spec_norm = (
    (data.unsqueeze(0) - data.mean()) / data.std()).clamp(-1, 1) * .5 + .5
writer.add_image('spec/origin', spec_norm, 0)
writer.add_audio('audio/origin',
                 griffin_lim(data),
                 0, sample_rate=sample_rate)