def read_aud_sim(self, fname):
    """Load an audio file and return its mel spectrogram.

    Args:
        fname: Path to the audio file; passed straight to ``preprocess``,
            which is expected to return a 1-D waveform (presumably float
            samples at 16 kHz, judging by the removed debug-write code —
            TODO confirm against ``preprocess``).

    Returns:
        A ``torch.Tensor`` mel spectrogram with ``self.config['MEL_CHANNELS']``
        mel bins, computed with n_fft=1024, hop_length=256, win_length=1024.
    """
    # NOTE(review): removed long-dead commented-out code (webrtcvad VAD,
    # librosa.load, soundfile.write debug dump) that was cluttering the body.
    aud = preprocess(fname)
    # Mel channel count comes from config; FFT/window parameters are fixed.
    mel_transform = torchaudio.transforms.MelSpectrogram(
        n_mels=self.config['MEL_CHANNELS'],
        n_fft=1024,
        hop_length=256,
        win_length=1024,
    )
    # torch.Tensor(aud) copies and casts the waveform to float32.
    return mel_transform(torch.Tensor(aud))
def infer(self, fname, cpt):
    """Compute speaker embeddings for an audio file.

    Args:
        fname: Path to the audio file; handed to ``preprocess`` to obtain
            the waveform (exact format depends on ``preprocess`` — verify).
        cpt: Unused here; kept for interface compatibility with callers.
            NOTE(review): presumably a checkpoint identifier — confirm.

    Returns:
        The embedding produced by ``encoder.embed`` for the preprocessed
        waveform.
    """
    waveform = preprocess(fname)
    return encoder.embed(waveform)