Example #1
0
    def __getitem__(self, index):
        """Tokenize the parent dataset's text and attach a duration prior.

        Fetches the parent example, encodes its text with ``self.vocab``, and
        loads a beta-binomial duration-attention prior from the supplementary
        folder, computing and caching it on a miss.  All other parent fields
        pass through unchanged.
        """
        (text, _, log_mel, log_mel_length, audio, audio_length, _, pitch,
         energy) = super().__getitem__(index)

        tokens = torch.tensor(self.vocab.encode(text)).long()
        tokens_length = torch.tensor(len(tokens)).long()

        # Reuse a previously saved prior when one exists; otherwise compute
        # it once and cache it for subsequent epochs.
        prior_file = (Path(self.supplementary_folder) /
                      f"pr_tl{tokens_length}_al_{log_mel_length}.pt")
        if prior_file.exists():
            duration_prior = torch.load(prior_file)
        else:
            duration_prior = torch.from_numpy(
                beta_binomial_prior_distribution(tokens_length,
                                                 log_mel_length))
            torch.save(duration_prior, prior_file)

        return (tokens, tokens_length, log_mel, log_mel_length, audio,
                audio_length, duration_prior, pitch, energy)
Example #2
0
    def __getitem__(self, index):
        """Return one dataset example as a tuple of tensors.

        Loads the raw audio and tokenized text for ``self.data[index]``; each
        supplementary feature (log-mel, durations, alignment prior, pitch,
        energy, speaker id) is produced only when its type is present in
        ``self.sup_data_types_set`` and is returned as ``None`` otherwise.
        Computed features are cached to disk under their respective folders,
        keyed by the audio file's path relative to ``self.base_data_dir``.

        Returns:
            Tuple of (audio, audio_length, text, text_length, log_mel,
            log_mel_length, durations, align_prior_matrix, pitch,
            pitch_length, energy, energy_length, speaker_id).
        """
        sample = self.data[index]

        # Let's keep audio name and all internal directories in rel_audio_path_as_text_id to avoid any collisions
        rel_audio_path = Path(sample["audio_filepath"]).relative_to(
            self.base_data_dir).with_suffix("")
        rel_audio_path_as_text_id = str(rel_audio_path).replace("/", "_")

        # Load audio
        features = self.featurizer.process(sample["audio_filepath"],
                                           trim=self.trim)
        audio, audio_length = features, torch.tensor(features.shape[0]).long()

        # Load text
        text = torch.tensor(sample["text_tokens"]).long()
        text_length = torch.tensor(len(sample["text_tokens"])).long()

        # Load mel if needed
        log_mel, log_mel_length = None, None
        if LogMel in self.sup_data_types_set:
            mel_path = sample["mel_filepath"]

            # Prefer an explicit mel path from the manifest; otherwise fall
            # back to (or create) the cached copy in self.log_mel_folder.
            if mel_path is not None and Path(mel_path).exists():
                log_mel = torch.load(mel_path)
            else:
                mel_path = self.log_mel_folder / f"{rel_audio_path_as_text_id}.pt"

                if mel_path.exists():
                    log_mel = torch.load(mel_path)
                else:
                    log_mel = self.get_log_mel(audio)
                    torch.save(log_mel, mel_path)

            log_mel = log_mel.squeeze(0)
            log_mel_length = torch.tensor(log_mel.shape[1]).long()

        # Load durations if needed
        durations = None
        if Durations in self.sup_data_types_set:
            durations = self.durs[index]

        # Load alignment prior matrix if needed
        align_prior_matrix = None
        if AlignPriorMatrix in self.sup_data_types_set:
            if self.use_beta_binomial_interpolator:
                # Interpolator path: recomputed every call, not cached.
                mel_len = self.get_log_mel(audio).shape[2]
                align_prior_matrix = torch.from_numpy(
                    self.beta_binomial_interpolator(mel_len,
                                                    text_length.item()))
            else:
                prior_path = self.align_prior_matrix_folder / f"{rel_audio_path_as_text_id}.pt"

                if prior_path.exists():
                    align_prior_matrix = torch.load(prior_path)
                else:
                    mel_len = self.get_log_mel(audio).shape[2]
                    align_prior_matrix = beta_binomial_prior_distribution(
                        text_length, mel_len)
                    align_prior_matrix = torch.from_numpy(align_prior_matrix)
                    torch.save(align_prior_matrix, prior_path)

        # Load pitch if needed
        pitch, pitch_length = None, None
        if Pitch in self.sup_data_types_set:
            pitch_path = self.pitch_folder / f"{rel_audio_path_as_text_id}.pt"

            if pitch_path.exists():
                pitch = torch.load(pitch_path).float()
            else:
                # Extract F0 with pyin; fill_na=0.0 replaces unvoiced frames.
                pitch, _, _ = librosa.pyin(
                    audio.numpy(),
                    fmin=self.pitch_fmin,
                    fmax=self.pitch_fmax,
                    frame_length=self.win_length,
                    sr=self.sample_rate,
                    fill_na=0.0,
                )
                pitch = torch.from_numpy(pitch).float()
                torch.save(pitch, pitch_path)

            # Normalize only when configured; the file saved above always
            # holds the unnormalized contour.
            if self.pitch_mean is not None and self.pitch_std is not None and self.pitch_norm:
                pitch -= self.pitch_mean
                pitch[
                    pitch == -self.
                    pitch_mean] = 0.0  # Zero out values that were previously zero
                pitch /= self.pitch_std

            pitch_length = torch.tensor(len(pitch)).long()

        # Load energy if needed
        energy, energy_length = None, None
        if Energy in self.sup_data_types_set:
            energy_path = self.energy_folder / f"{rel_audio_path_as_text_id}.pt"

            if energy_path.exists():
                energy = torch.load(energy_path).float()
            else:
                # Energy = per-frame L2 norm of the magnitude spectrogram.
                spec = self.get_spec(audio)
                energy = torch.linalg.norm(spec.squeeze(0), axis=0).float()
                torch.save(energy, energy_path)

            energy_length = torch.tensor(len(energy)).long()

        # Load speaker id if needed
        speaker_id = None
        if SpeakerID in self.sup_data_types_set:
            speaker_id = torch.tensor(sample["speaker_id"]).long()

        return (
            audio,
            audio_length,
            text,
            text_length,
            log_mel,
            log_mel_length,
            durations,
            align_prior_matrix,
            pitch,
            pitch_length,
            energy,
            energy_length,
            speaker_id,
        )
Example #3
0
    def __getitem__(self, index):
        """Assemble one example, computing and disk-caching any requested
        supplementary features (log-mel, duration prior, pitch, energy).

        Features whose type is absent from ``self.sup_data_types_set`` are
        returned as ``None``.  Cache files live under ``self.sup_data_path``
        and are keyed by the audio file's stem.
        """
        entry = self.data[index]
        wav_stem = Path(entry["audio_filepath"]).stem

        processed = self.featurizer.process(entry["audio_filepath"],
                                            trim=self.trim)
        audio = processed
        audio_length = torch.tensor(processed.shape[0]).long()

        text = torch.tensor(entry["text_tokens"]).long()
        text_length = torch.tensor(len(entry["text_tokens"])).long()

        log_mel = log_mel_length = None
        if LogMel in self.sup_data_types_set:
            mel_file = entry["mel_filepath"]
            if mel_file is not None and Path(mel_file).exists():
                log_mel = torch.load(mel_file)
            else:
                # Fall back to the supplementary-folder cache, creating it
                # on a miss.
                mel_file = Path(self.sup_data_path) / f"mel_{wav_stem}.pt"
                if mel_file.exists():
                    log_mel = torch.load(mel_file)
                else:
                    log_mel = self.get_log_mel(audio)
                    torch.save(log_mel, mel_file)
            log_mel = log_mel.squeeze(0)
            log_mel_length = torch.tensor(log_mel.shape[1]).long()

        durations = self.durs[index] if Durations in self.sup_data_types_set else None

        duration_prior = None
        if DurationPrior in self.sup_data_types_set:
            if self.use_beta_binomial_interpolator:
                mel_len = self.get_log_mel(audio).shape[2]
                duration_prior = torch.from_numpy(
                    self.beta_binomial_interpolator(mel_len,
                                                    text_length.item()))
            else:
                prior_file = Path(self.sup_data_path) / f"pr_{wav_stem}.pt"
                if prior_file.exists():
                    duration_prior = torch.load(prior_file)
                else:
                    mel_len = self.get_log_mel(audio).shape[2]
                    duration_prior = torch.from_numpy(
                        beta_binomial_prior_distribution(text_length,
                                                         mel_len))
                    torch.save(duration_prior, prior_file)

        pitch = pitch_length = None
        if Pitch in self.sup_data_types_set:
            pitch_file = Path(self.sup_data_path) / (
                f"{wav_stem}_pitch_pyin_"
                f"fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_"
                f"fl{self.win_length}_hs{self.hop_len}.pt")
            if pitch_file.exists():
                pitch = torch.load(pitch_file).float()
            else:
                f0, _, _ = librosa.pyin(
                    audio.numpy(),
                    fmin=self.pitch_fmin,
                    fmax=self.pitch_fmax,
                    frame_length=self.win_length,
                    sr=self.sample_rate,
                    fill_na=0.0,
                )
                pitch = torch.from_numpy(f0).float()
                torch.save(pitch, pitch_file)

            # Standardize only when configured; the cached file always holds
            # the raw contour.
            if self.pitch_avg is not None and self.pitch_std is not None and self.pitch_norm:
                pitch -= self.pitch_avg
                # Keep frames that were zero before the shift at exactly zero.
                pitch[pitch == -self.pitch_avg] = 0.0
                pitch /= self.pitch_std

            pitch_length = torch.tensor(len(pitch)).long()

        energy = energy_length = None
        if Energy in self.sup_data_types_set:
            energy_file = Path(self.sup_data_path) / (
                f"{wav_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt")
            if energy_file.exists():
                energy = torch.load(energy_file).float()
            else:
                magnitudes = self.get_spec(audio)
                energy = torch.linalg.norm(magnitudes.squeeze(0),
                                           axis=0).float()
                torch.save(energy, energy_file)
            energy_length = torch.tensor(len(energy)).long()

        speaker_id = None
        if SpeakerID in self.sup_data_types_set:
            speaker_id = torch.tensor(entry["speaker_id"]).long()

        return (audio, audio_length, text, text_length, log_mel,
                log_mel_length, durations, duration_prior, pitch,
                pitch_length, energy, energy_length, speaker_id)
Example #4
0
    def __getitem__(self, index):
        """Return one example as (text, text_length, log_mel, log_mel_length,
        audio, audio_length, duration_prior, pitch, energy).

        The log-mel spectrogram, duration-attention prior, pitch (F0) contour
        and per-frame energy are read from ``self.supplementary_folder`` when
        previously cached there, and computed and saved otherwise.
        ``text_length`` (and the prior) are ``None`` when the manifest stores
        untokenized text.
        """
        spec = None  # guarded magnitude spectrogram; computed lazily below
        sample = self.data[index]

        # Load audio
        features = self.featurizer.process(sample["audio_filepath"],
                                           trim=self.trim)
        audio, audio_length = features, torch.tensor(features.shape[0]).long()

        if isinstance(sample["text_tokens"], str):
            # If tokenize_text is False for Phone dataset
            text = sample["text_tokens"]
            text_length = None
        else:
            text = torch.tensor(sample["text_tokens"]).long()
            text_length = torch.tensor(len(sample["text_tokens"])).long()
        audio_stem = Path(sample["audio_filepath"]).stem

        # Load mel if it exists, otherwise compute and cache it
        mel_path = sample["mel_filepath"]
        if mel_path and Path(mel_path).exists():
            log_mel = torch.load(mel_path)
        else:
            mel_path = Path(self.supplementary_folder) / f"mel_{audio_stem}.pt"
            if mel_path.exists():
                log_mel = torch.load(mel_path)
            else:
                # disable autocast to get full range of stft values
                with torch.cuda.amp.autocast(enabled=False):
                    spec = self.stft(audio)

                    # guard is needed for sqrt if grads are passed through
                    guard = CONSTANT  # TODO: Enable 0 if not self.use_grads else CONSTANT
                    if spec.dtype in [torch.cfloat, torch.cdouble]:
                        spec = torch.view_as_real(spec)
                    spec = torch.sqrt(spec.pow(2).sum(-1) + guard)

                    mel = torch.matmul(self.fb.to(spec.dtype), spec)

                    log_mel = torch.log(
                        torch.clamp(mel, min=torch.finfo(mel.dtype).tiny))
                    torch.save(log_mel, mel_path)

        log_mel = log_mel.squeeze(0)
        log_mel_length = torch.tensor(log_mel.shape[1]).long()

        duration_prior = None
        if text_length is not None:
            # Make the duration-attention prior if it does not already exist
            # in the supplementary folder.
            # NOTE(review): text_length/log_mel_length are 0-d tensors, so the
            # cache file names embed e.g. "tensor(5)"; self-consistent, but
            # ugly — confirm before changing (it would invalidate caches).
            prior_path = Path(self.supplementary_folder
                              ) / f"pr_tl{text_length}_al_{log_mel_length}.pt"
            if prior_path.exists():
                duration_prior = torch.load(prior_path)
            else:
                duration_prior = beta_binomial_prior_distribution(
                    text_length, log_mel_length)
                duration_prior = torch.from_numpy(duration_prior)
                torch.save(duration_prior, prior_path)

        # Load pitch file (F0s), computing it with pyin on a cache miss
        pitch_path = (
            Path(self.supplementary_folder) /
            f"{audio_stem}_pitch_pyin_fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_fl{self.win_length}_hs{self.hop_len}.pt"
        )
        if pitch_path.exists():
            pitch = torch.load(pitch_path)
        else:
            pitch, _, _ = librosa.pyin(
                audio.numpy(),
                fmin=self.pitch_fmin,
                fmax=self.pitch_fmax,
                frame_length=self.win_length,
                sr=self.sample_rate,
                fill_na=0.0,
            )
            pitch = torch.from_numpy(pitch)
            torch.save(pitch, pitch_path)
        # Standardize pitch (the cached file holds the raw contour)
        pitch -= self.pitch_avg
        pitch[pitch == -self.
              pitch_avg] = 0.0  # Zero out values that were previously zero
        pitch /= self.pitch_std

        # Load energy file (L2-norm of the amplitude of each STFT frame of an utterance)
        energy_path = Path(
            self.supplementary_folder
        ) / f"{audio_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt"
        if energy_path.exists():
            energy = torch.load(energy_path)
        else:
            if spec is None:
                # Bug fix: the previous fallback used the raw stft output
                # here, which may be complex (or stacked real/imag) and lacks
                # the sqrt guard, so the cached energy differed depending on
                # whether the mel cache was hit.  Recompute the guarded
                # magnitude spectrogram exactly as the mel branch does.
                with torch.cuda.amp.autocast(enabled=False):
                    spec = self.stft(audio)
                    if spec.dtype in [torch.cfloat, torch.cdouble]:
                        spec = torch.view_as_real(spec)
                    spec = torch.sqrt(spec.pow(2).sum(-1) + CONSTANT)
            energy = torch.linalg.norm(spec.squeeze(0), dim=0)
            # Save to new file
            torch.save(energy, energy_path)

        return text, text_length, log_mel, log_mel_length, audio, audio_length, duration_prior, pitch, energy