def __getitem__(self, index):
    """Fetch the parent sample, re-tokenize its text into phones, and attach
    a beta-binomial duration (attention) prior cached under the supplementary
    folder.

    Returns a 9-tuple: (phones_tokenized, phones_length, log_mel,
    log_mel_length, audio, audio_length, duration_prior, pitch, energy).
    """
    parent_sample = super().__getitem__(index)
    text, _, log_mel, log_mel_length, audio, audio_length, _, pitch, energy = parent_sample

    # Re-encode the raw text with this dataset's phone vocabulary.
    phones_tokenized = torch.tensor(self.vocab.encode(text)).long()
    phones_length = torch.tensor(len(phones_tokenized)).long()

    # The prior depends only on (token length, mel length), so it is cached
    # on disk keyed by those two values.
    prior_path = Path(self.supplementary_folder) / f"pr_tl{phones_length}_al_{log_mel_length}.pt"
    if not prior_path.exists():
        duration_prior = torch.from_numpy(
            beta_binomial_prior_distribution(phones_length, log_mel_length))
        torch.save(duration_prior, prior_path)
    else:
        duration_prior = torch.load(prior_path)

    return (
        phones_tokenized,
        phones_length,
        log_mel,
        log_mel_length,
        audio,
        audio_length,
        duration_prior,
        pitch,
        energy,
    )
def __getitem__(self, index):
    """Assemble one training example: waveform + tokenized text, plus any
    supplementary feature types requested in ``self.sup_data_types_set``
    (log-mel, durations, alignment prior, pitch, energy, speaker id).

    Expensive features are cached to disk under per-type folders, keyed by the
    audio file's path relative to ``self.base_data_dir``.

    Returns a 13-tuple: (audio, audio_length, text, text_length, log_mel,
    log_mel_length, durations, align_prior_matrix, pitch, pitch_length,
    energy, energy_length, speaker_id) — entries for unrequested feature
    types are ``None``.
    """
    sample = self.data[index]

    # Let's keep audio name and all internal directories in rel_audio_path_as_text_id to avoid any collisions
    rel_audio_path = Path(sample["audio_filepath"]).relative_to(self.base_data_dir).with_suffix("")
    rel_audio_path_as_text_id = str(rel_audio_path).replace("/", "_")

    # Load audio
    features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
    audio, audio_length = features, torch.tensor(features.shape[0]).long()

    # Load text (assumes the manifest pre-tokenized it into int ids — TODO confirm)
    text = torch.tensor(sample["text_tokens"]).long()
    text_length = torch.tensor(len(sample["text_tokens"])).long()

    # Load mel if needed: prefer an explicit manifest path, then the on-disk
    # cache, and only compute (and cache) as a last resort.
    log_mel, log_mel_length = None, None
    if LogMel in self.sup_data_types_set:
        mel_path = sample["mel_filepath"]
        if mel_path is not None and Path(mel_path).exists():
            log_mel = torch.load(mel_path)
        else:
            mel_path = self.log_mel_folder / f"{rel_audio_path_as_text_id}.pt"
            if mel_path.exists():
                log_mel = torch.load(mel_path)
            else:
                log_mel = self.get_log_mel(audio)
                torch.save(log_mel, mel_path)
        log_mel = log_mel.squeeze(0)
        log_mel_length = torch.tensor(log_mel.shape[1]).long()

    # Load durations if needed (precomputed at construction time, presumably)
    durations = None
    if Durations in self.sup_data_types_set:
        durations = self.durs[index]

    # Load alignment prior matrix if needed
    align_prior_matrix = None
    if AlignPriorMatrix in self.sup_data_types_set:
        if self.use_beta_binomial_interpolator:
            # Interpolator path recomputes mel length rather than reusing
            # log_mel_length, since LogMel may not have been requested.
            mel_len = self.get_log_mel(audio).shape[2]
            align_prior_matrix = torch.from_numpy(
                self.beta_binomial_interpolator(mel_len, text_length.item()))
        else:
            prior_path = self.align_prior_matrix_folder / f"{rel_audio_path_as_text_id}.pt"
            if prior_path.exists():
                align_prior_matrix = torch.load(prior_path)
            else:
                mel_len = self.get_log_mel(audio).shape[2]
                align_prior_matrix = beta_binomial_prior_distribution(text_length, mel_len)
                align_prior_matrix = torch.from_numpy(align_prior_matrix)
                torch.save(align_prior_matrix, prior_path)

    # Load pitch if needed (pyin F0 estimate; unvoiced frames filled with 0.0)
    pitch, pitch_length = None, None
    if Pitch in self.sup_data_types_set:
        pitch_path = self.pitch_folder / f"{rel_audio_path_as_text_id}.pt"
        if pitch_path.exists():
            pitch = torch.load(pitch_path).float()
        else:
            pitch, _, _ = librosa.pyin(
                audio.numpy(),
                fmin=self.pitch_fmin,
                fmax=self.pitch_fmax,
                frame_length=self.win_length,
                sr=self.sample_rate,
                fill_na=0.0,
            )
            pitch = torch.from_numpy(pitch).float()
            torch.save(pitch, pitch_path)

        # Optional mean/std normalization. Frames that were exactly 0 before
        # centering (unvoiced fill values) are restored to 0 afterwards.
        if self.pitch_mean is not None and self.pitch_std is not None and self.pitch_norm:
            pitch -= self.pitch_mean
            pitch[pitch == -self.pitch_mean] = 0.0  # Zero out values that were previously zero
            pitch /= self.pitch_std

        pitch_length = torch.tensor(len(pitch)).long()

    # Load energy if needed (per-frame L2 norm of the spectrogram)
    energy, energy_length = None, None
    if Energy in self.sup_data_types_set:
        energy_path = self.energy_folder / f"{rel_audio_path_as_text_id}.pt"
        if energy_path.exists():
            energy = torch.load(energy_path).float()
        else:
            spec = self.get_spec(audio)
            energy = torch.linalg.norm(spec.squeeze(0), axis=0).float()
            torch.save(energy, energy_path)
        energy_length = torch.tensor(len(energy)).long()

    # Load speaker id if needed
    speaker_id = None
    if SpeakerID in self.sup_data_types_set:
        speaker_id = torch.tensor(sample["speaker_id"]).long()

    return (
        audio,
        audio_length,
        text,
        text_length,
        log_mel,
        log_mel_length,
        durations,
        align_prior_matrix,
        pitch,
        pitch_length,
        energy,
        energy_length,
        speaker_id,
    )
def __getitem__(self, index):
    """Build one example with whatever supplementary feature types were
    requested (``self.sup_data_types_set``), caching computed features on
    disk under ``self.sup_data_path`` keyed by the audio file stem.

    Returns a 13-tuple: (audio, audio_length, text, text_length, log_mel,
    log_mel_length, durations, duration_prior, pitch, pitch_length, energy,
    energy_length, speaker_id) — unrequested entries are ``None``.
    """
    sample = self.data[index]
    audio_stem = Path(sample["audio_filepath"]).stem

    # Waveform.
    features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
    audio = features
    audio_length = torch.tensor(features.shape[0]).long()

    # Token ids.
    text = torch.tensor(sample["text_tokens"]).long()
    text_length = torch.tensor(len(sample["text_tokens"])).long()

    # Log-mel spectrogram: manifest path first, then disk cache, then compute.
    log_mel = None
    log_mel_length = None
    if LogMel in self.sup_data_types_set:
        mel_path = sample["mel_filepath"]
        if mel_path is None or not Path(mel_path).exists():
            mel_path = Path(self.sup_data_path) / f"mel_{audio_stem}.pt"
            if not mel_path.exists():
                log_mel = self.get_log_mel(audio)
                torch.save(log_mel, mel_path)
        if log_mel is None:
            log_mel = torch.load(mel_path)
        log_mel = log_mel.squeeze(0)
        log_mel_length = torch.tensor(log_mel.shape[1]).long()

    # Durations were precomputed elsewhere; just index them.
    durations = self.durs[index] if Durations in self.sup_data_types_set else None

    # Beta-binomial duration prior, either interpolated on the fly or cached.
    duration_prior = None
    if DurationPrior in self.sup_data_types_set:
        if self.use_beta_binomial_interpolator:
            mel_len = self.get_log_mel(audio).shape[2]
            duration_prior = torch.from_numpy(
                self.beta_binomial_interpolator(mel_len, text_length.item()))
        else:
            prior_path = Path(self.sup_data_path) / f"pr_{audio_stem}.pt"
            if prior_path.exists():
                duration_prior = torch.load(prior_path)
            else:
                mel_len = self.get_log_mel(audio).shape[2]
                duration_prior = torch.from_numpy(
                    beta_binomial_prior_distribution(text_length, mel_len))
                torch.save(duration_prior, prior_path)

    # Pitch contour via pyin (unvoiced frames filled with 0.0), cached under a
    # name that encodes every parameter affecting the result.
    pitch = None
    pitch_length = None
    if Pitch in self.sup_data_types_set:
        pitch_name = (
            f"{audio_stem}_pitch_pyin_"
            f"fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_"
            f"fl{self.win_length}_hs{self.hop_len}.pt"
        )
        pitch_path = Path(self.sup_data_path) / pitch_name
        if pitch_path.exists():
            pitch = torch.load(pitch_path).float()
        else:
            pitch, _, _ = librosa.pyin(
                audio.numpy(),
                fmin=self.pitch_fmin,
                fmax=self.pitch_fmax,
                frame_length=self.win_length,
                sr=self.sample_rate,
                fill_na=0.0,
            )
            pitch = torch.from_numpy(pitch).float()
            torch.save(pitch, pitch_path)

        # Optional mean/std normalization; restore exact zeros for frames that
        # were zero (unvoiced) before centering.
        if self.pitch_avg is not None and self.pitch_std is not None and self.pitch_norm:
            pitch -= self.pitch_avg
            pitch[pitch == -self.pitch_avg] = 0.0
            pitch /= self.pitch_std

        pitch_length = torch.tensor(len(pitch)).long()

    # Per-frame energy: L2 norm of the spectrogram frames, cached on disk.
    energy = None
    energy_length = None
    if Energy in self.sup_data_types_set:
        energy_path = Path(self.sup_data_path) / (
            f"{audio_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt"
        )
        if energy_path.exists():
            energy = torch.load(energy_path).float()
        else:
            spec = self.get_spec(audio)
            energy = torch.linalg.norm(spec.squeeze(0), axis=0).float()
            torch.save(energy, energy_path)
        energy_length = torch.tensor(len(energy)).long()

    # Speaker id straight from the manifest.
    speaker_id = None
    if SpeakerID in self.sup_data_types_set:
        speaker_id = torch.tensor(sample["speaker_id"]).long()

    return (
        audio,
        audio_length,
        text,
        text_length,
        log_mel,
        log_mel_length,
        durations,
        duration_prior,
        pitch,
        pitch_length,
        energy,
        energy_length,
        speaker_id,
    )
def __getitem__(self, index):
    """Return one example: text (tokens or raw string), log-mel, audio,
    duration prior, pitch, and energy, computing and disk-caching the
    expensive features under ``self.supplementary_folder``.

    Returns a 9-tuple: (text, text_length, log_mel, log_mel_length, audio,
    audio_length, duration_prior, pitch, energy). ``text_length`` and
    ``duration_prior`` are ``None`` when the text is an untokenized string.
    """
    # ``spec`` is lazily populated by the mel branch and reused by the energy
    # branch below to avoid a second STFT when possible.
    spec = None
    sample = self.data[index]

    features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
    audio, audio_length = features, torch.tensor(features.shape[0]).long()

    if isinstance(sample["text_tokens"], str):
        # If tokenize_text is False for Phone dataset
        text = sample["text_tokens"]
        text_length = None
    else:
        text = torch.tensor(sample["text_tokens"]).long()
        text_length = torch.tensor(len(sample["text_tokens"])).long()

    audio_stem = Path(sample["audio_filepath"]).stem

    # Load mel if it exists: manifest path first, then cache, then compute.
    mel_path = sample["mel_filepath"]
    if mel_path and Path(mel_path).exists():
        log_mel = torch.load(mel_path)
    else:
        mel_path = Path(self.supplementary_folder) / f"mel_{audio_stem}.pt"
        if mel_path.exists():
            log_mel = torch.load(mel_path)
        else:
            # disable autocast to get full range of stft values
            with torch.cuda.amp.autocast(enabled=False):
                spec = self.stft(audio)
                # guard is needed for sqrt if grads are passed through
                guard = CONSTANT  # TODO: Enable 0 if not self.use_grads else CONSTANT
                if spec.dtype in [torch.cfloat, torch.cdouble]:
                    spec = torch.view_as_real(spec)
                # Magnitude over the (real, imag) last dim, epsilon-guarded.
                spec = torch.sqrt(spec.pow(2).sum(-1) + guard)
                mel = torch.matmul(self.fb.to(spec.dtype), spec)
                # Clamp to the dtype's smallest positive normal so log is finite.
                log_mel = torch.log(torch.clamp(mel, min=torch.finfo(mel.dtype).tiny))
            torch.save(log_mel, mel_path)
    log_mel = log_mel.squeeze(0)
    log_mel_length = torch.tensor(log_mel.shape[1]).long()

    duration_prior = None
    if text_length is not None:
        ### Make duration attention prior if not exist in the supplementary folder
        prior_path = Path(self.supplementary_folder) / f"pr_tl{text_length}_al_{log_mel_length}.pt"
        if prior_path.exists():
            duration_prior = torch.load(prior_path)
        else:
            duration_prior = beta_binomial_prior_distribution(text_length, log_mel_length)
            duration_prior = torch.from_numpy(duration_prior)
            torch.save(duration_prior, prior_path)

    # Load pitch file (F0s)
    pitch_path = (
        Path(self.supplementary_folder)
        / f"{audio_stem}_pitch_pyin_fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_fl{self.win_length}_hs{self.hop_len}.pt"
    )
    if pitch_path.exists():
        pitch = torch.load(pitch_path)
    else:
        pitch, _, _ = librosa.pyin(
            audio.numpy(),
            fmin=self.pitch_fmin,
            fmax=self.pitch_fmax,
            frame_length=self.win_length,
            sr=self.sample_rate,
            fill_na=0.0,
        )
        pitch = torch.from_numpy(pitch)
        torch.save(pitch, pitch_path)
    # Standardize pitch.
    # NOTE(review): unlike the other dataset variants, this normalization is
    # unconditional — it assumes pitch_avg/pitch_std are always set; confirm.
    pitch -= self.pitch_avg
    pitch[pitch == -self.pitch_avg] = 0.0  # Zero out values that were previously zero
    pitch /= self.pitch_std

    # Load energy file (L2-norm of the amplitude of each STFT frame of an utterance)
    energy_path = Path(self.supplementary_folder) / f"{audio_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt"
    if energy_path.exists():
        energy = torch.load(energy_path)
    else:
        # NOTE(review): when the mel branch ran, ``spec`` is the guarded
        # magnitude; when recomputed here it is the raw stft output. The L2
        # norm of a complex frame equals that of its magnitudes (up to the
        # guard epsilon), so results should match closely — confirm intended.
        if spec is None:
            spec = self.stft(audio)
        energy = torch.linalg.norm(spec.squeeze(0), axis=0)
        # Save to new file
        torch.save(energy, energy_path)

    return text, text_length, log_mel, log_mel_length, audio, audio_length, duration_prior, pitch, energy