def get_mel(self, filename):
    if not self.load_mel_from_disk:
        audio, sampling_rate, max_value = load_wav_to_torch(filename)
        if self.audio_offset:  # used for extreme GTA'ing
            audio = audio[self.audio_offset:]
        # Sometimes the magnitude of the audio exceeds the max of the datatype
        # used before casting, so take the largest observed value instead.
        self.max_wav_value = max(
            max_value, audio.max().item(), -audio.min().item())
        if sampling_rate != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
    else:
        melspec = torch.from_numpy(np.load(filename, allow_pickle=True)).float()
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
    return melspec

def save_mel(file):
    audio, sampling_rate = load_wav_to_torch(file)
    if sampling_rate != stft.sampling_rate:
        raise ValueError("{}: {} SR doesn't match target {} SR".format(
            file, sampling_rate, stft.sampling_rate))
    melspec = stft.mel_spectrogram(
        audio.unsqueeze(0)).squeeze(0).cpu().numpy()
    np.save(file.replace('.wav', ''), melspec)

def check_file_lengths(sampling_rate, segment_size, training_files_old):
    """Keep only files longer than one training segment and shorter than 15s."""
    segment_size_s = segment_size / sampling_rate
    training_files_new = []
    for file in tqdm(training_files_old):
        audio, native_sr = load_wav_to_torch(
            file, min_sr=sampling_rate * 0.9, target_sr=None,
            return_empty_on_exception=True)
        audio_s = len(audio) / native_sr
        if segment_size_s < audio_s < 15.0:
            training_files_new.append(file)
    return training_files_new

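# A minimal usage sketch for check_file_lengths, assuming a plain-text
# filelist with one .wav path per line; 22050 Hz and an 8192-sample segment
# are illustrative hparams, not values taken from this repo.
with open('filelist.txt') as f:
    candidate_files = [line.strip() for line in f if line.strip()]
training_files = check_file_lengths(22050, 8192, candidate_files)
print('{}/{} files usable for training'.format(
    len(training_files), len(candidate_files)))
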
# Variant of save_mel that normalizes the audio before computing the mel.
def save_mel(file):
    audio, sampling_rate = load_wav_to_torch(file)
    if sampling_rate != stft.sampling_rate:
        raise ValueError("{}: {} SR doesn't match target {} SR".format(
            file, sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0).cpu().numpy()
    np.save(file.replace('.wav', ''), melspec)

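# A minimal sketch of precomputing mels for a whole filelist with save_mel,
# assuming the module-level 'stft' and 'hparams' the function expects already
# exist; the thread pool is optional and shown only as one way to speed it up.
from multiprocessing.dummy import Pool  # threads, so 'stft' needn't be pickled

with open('filelist.txt') as f:
    wav_files = [line.strip() for line in f if line.strip().endswith('.wav')]
with Pool(4) as pool:
    list(tqdm(pool.imap(save_mel, wav_files), total=len(wav_files)))
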
def __getitem__(self, index):
    audiopath = self.audio_files[index]

    # load audio
    if not self.fine_tuning:
        audio, sampling_rate = load_wav_to_torch(audiopath, target_sr=self.sampling_rate)
        audio = audio - audio.mean()  # remove DC offset
        audio = (audio / audio.abs().max()) * 0.95  # and normalize volume
        if self.trim_non_voiced:  # trim out non-voiced segments
            assert len(audio.shape) == 1  # [T]
            f0, voiced = self.get_pitch(audio)
            start_indx, end_indx = get_nonzero_indexes(voiced)
            audio = audio[start_indx * self.hop_size:end_indx * self.hop_size]
    else:
        pm_audio_path = os.path.splitext(audiopath)[0] + '.pm_audio.pt'  # predicted-mel audio
        audio = torch.load(pm_audio_path).float()  # [T]
    audio = audio.unsqueeze(0)  # [T] -> [1, T]

    if not self.fine_tuning:
        if self.split:
            if audio.size(1) >= self.segment_size:
                max_audio_start = audio.size(1) - self.segment_size
                audio_start = random.randint(0, max_audio_start)
                audio = audio[:, audio_start:audio_start + self.segment_size]
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_size - audio.size(1)), 'constant')
        gt_mel = self.STFT.get_mel(audio)
        # 'mel' is the input to the vocoder; 'gt_mel' is the original mel that
        # will be used as a target by the model.
        mel = gt_mel
    else:
        pred_mel_path = os.path.splitext(audiopath)[0] + '.pred_mel.pt'
        mel = torch.load(pred_mel_path).float()
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)  # [n_mel, mel_T] -> [1, n_mel, mel_T]
        if self.split:
            frames_per_seg = math.ceil(self.segment_size / self.hop_size)
            if audio.size(1) >= self.segment_size:
                mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
            else:
                mel = torch.nn.functional.pad(
                    mel, (0, frames_per_seg - mel.size(2)), 'constant')
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_size - audio.size(1)), 'constant')
        gt_mel = self.STFT.get_mel(audio)
        # trim both mels to a common length, then align the predicted mel to
        # the ground-truth mel before using them as an (input, target) pair
        min_mel_len = min(gt_mel.shape[-1], mel.shape[-1])
        mel = mel[:, :, :min_mel_len]
        gt_mel = gt_mel[:, :, :min_mel_len]
        mel = DTW(mel, gt_mel, 5, 3)

    return (mel.squeeze(), audio.squeeze(0), audiopath, gt_mel.squeeze())

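# A minimal sketch of driving this __getitem__ through a DataLoader. The
# class name 'MelDataset' and the constructor args are assumptions; the fixed
# batch size relies on self.split being True so every sample is exactly
# segment_size long and the default collate can stack them.
from torch.utils.data import DataLoader

dataset = MelDataset(...)  # hypothetical name; constructor args are repo-specific
loader = DataLoader(dataset, batch_size=16, shuffle=True,
                    num_workers=4, pin_memory=True, drop_last=True)
for mel, audio, audiopath, gt_mel in loader:
    break  # mel is the vocoder input, gt_mel the training target
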
def get_embed_from_path(self, audiopath):
    audio, sr = load_wav_to_torch(audiopath, target_sr=16000)
    spec = get_spect(audio).float().unsqueeze(0)  # [1, mel_T, n_mel]
    spec = spec.to(next(self.parameters()))
    # pad mel_T (dim 1) up to a multiple of 128 so every chunk is full length
    if spec.shape[1] % 128:
        spec = torch.nn.functional.pad(spec, (0, 0, 0, 128 - spec.shape[1] % 128))
    embeds = []
    for i in range(0, spec.shape[1], 128):
        embed = self(spec[:, i:i + 128])  # [1, 128, 80] -> [1, embed]
        embeds.append(embed)
        if i > 5:  # stop after the second chunk; early audio is enough here
            break
    embeds = torch.mean(torch.cat(embeds, dim=0), dim=0)  # [n_chunks, embed] -> [embed]
    return embeds.cpu().float()  # [embed]

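# A minimal sketch comparing two speaker embeddings from get_embed_from_path,
# assuming 'encoder' is an instance of the module defining it; cosine
# similarity is one common choice here, not necessarily what this repo uses.
import torch.nn.functional as F

embed_a = encoder.get_embed_from_path('speaker_a.wav')
embed_b = encoder.get_embed_from_path('speaker_b.wav')
similarity = F.cosine_similarity(embed_a.unsqueeze(0), embed_b.unsqueeze(0)).item()
print('speaker similarity: {:.3f}'.format(similarity))
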
def get_audio_text_pair(self, index):
    audiopath, _, speaker_id, _, duration_path, enc_out_path = \
        self.audiopaths_and_text[index]
    encoder_outputs = torch.from_numpy(
        np.load(enc_out_path)).float()  # [enc_T, enc_dim]
    durations = torch.from_numpy(np.load(duration_path)).float()  # [enc_T]
    audio, sampling_rate = load_wav_to_torch(audiopath)  # [T]

    # pick a random segment of the audio, aligned to a hop boundary so it
    # maps cleanly onto mel frames
    max_audio_start = audio.size(0) - self.segment_length
    audio_start = random.randint(
        0, max_audio_start // self.hop_length) * self.hop_length
    audio_segment = audio[audio_start:audio_start + self.segment_length]
    attention_contexts = self.get_contexts(
        audio_start, encoder_outputs, durations)  # [dec_T, enc_dim]
    return (audio_segment, attention_contexts, encoder_outputs, durations)

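# A hypothetical smoke test for get_audio_text_pair, assuming 'dataset' is an
# instance of the surrounding class; the attribute names used in the asserts
# come straight from the function body above.
audio_segment, attention_contexts, encoder_outputs, durations = \
    dataset.get_audio_text_pair(0)
assert audio_segment.size(0) == dataset.segment_length
assert attention_contexts.size(1) == encoder_outputs.size(1)  # same enc_dim
assert durations.size(0) == encoder_outputs.size(0)  # one duration per enc_T
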
def get_mel(self, filename):
    if not self.load_mel_from_disk:
        audio, sampling_rate = load_wav_to_torch(filename)
        if self.audio_offset:  # used for extreme GTA'ing
            audio = audio[self.audio_offset:]
        if sampling_rate != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        melspec = self.stft.mel_spectrogram(audio.unsqueeze(0)).squeeze(0)
    else:
        melspec = torch.from_numpy(np.load(filename, allow_pickle=True)).float()
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
    return melspec

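# A minimal sketch of the two get_mel code paths, assuming 'dataset' exposes
# load_mel_from_disk as a plain attribute and that a matching .npy (as written
# by save_mel above) exists next to the .wav; both calls are hedged guesses at
# typical usage, not repo-documented API.
dataset.load_mel_from_disk = False
mel_from_wav = dataset.get_mel('clip.wav')   # computed on the fly
dataset.load_mel_from_disk = True
mel_from_disk = dataset.get_mel('clip.npy')  # precomputed by save_mel
assert mel_from_wav.size(0) == dataset.stft.n_mel_channels
assert mel_from_disk.size(0) == dataset.stft.n_mel_channels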