コード例 #1
0
ファイル: TTSDataset.py プロジェクト: a-froghyar/Capacitron
    def compute_input_seq(self, num_workers=0):
        """Replace the text of every item with its computed input sequence.

        Meant to be called before the dataset is handed to a data loader.
        When ``self.use_phonemes`` is false, each text goes through
        ``text_to_sequence`` and is stored as an ``int32`` array. Otherwise
        ``_phoneme_worker`` is applied to every item: in this process when
        ``num_workers`` is 0, through a ``Pool`` of ``num_workers``
        processes otherwise. Results are written in place to
        ``self.items[idx][0]``.
        """
        if not self.use_phonemes:
            if self.verbose:
                print(" | > Computing input sequences ...")
            for position, entry in enumerate(tqdm.tqdm(self.items)):
                raw_text, *_ = entry
                token_ids = text_to_sequence(raw_text, [self.cleaners],
                                             tp=self.tp,
                                             add_blank=self.add_blank)
                self.items[position][0] = np.asarray(token_ids,
                                                     dtype=np.int32)
            return

        # Arguments shared by every phoneme worker call.
        worker_args = [
            self.phoneme_cache_path, self.enable_eos_bos, self.cleaners,
            self.phoneme_language, self.tp, self.add_blank
        ]
        if self.verbose:
            print(" | > Computing phonemes ...")

        if num_workers == 0:
            for position, entry in enumerate(tqdm.tqdm(self.items)):
                self.items[position][0] = self._phoneme_worker(
                    [entry, worker_args])
            return

        with Pool(num_workers) as pool:
            jobs = [[entry, worker_args] for entry in self.items]
            # imap yields results in submission order, so positions line up
            # with self.items.
            progress = tqdm.tqdm(pool.imap(MyDataset._phoneme_worker, jobs),
                                 total=len(self.items))
            computed = list(progress)
            for position, sequence in enumerate(computed):
                self.items[position][0] = sequence
コード例 #2
0
ファイル: TTSDataset.py プロジェクト: zententacles/TTS
    def load_data(self, idx):
        """Build the sample dictionary for the item at position ``idx``.

        Items hold ``(text, wav_file, speaker_name)`` and may carry a fourth
        field with the path of an attention file.

        Args:
            idx: index into ``self.items``.

        Returns:
            dict with the keys ``text``, ``wav``, ``attn``, ``item_idx``,
            ``speaker_name`` and ``wav_file_name``. ``attn`` is ``None``
            when the item has no attention file.

        Raises:
            AssertionError: if the text sequence or the waveform is empty;
                the message is the wav file of the offending item.
        """
        item = self.items[idx]

        # Bind attn_file on both paths so the decision to load it below does
        # not depend on inspecting locals().
        if len(item) == 4:
            text, wav_file, speaker_name, attn_file = item
        else:
            text, wav_file, speaker_name = item
            attn_file = None

        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

        if self.use_phonemes:
            text = self._load_or_generate_phoneme_sequence(wav_file, text)
        else:
            text = np.asarray(text_to_sequence(text, [self.cleaners],
                                               tp=self.tp,
                                               add_blank=self.add_blank),
                              dtype=np.int32)

        assert text.size > 0, self.items[idx][1]
        assert wav.size > 0, self.items[idx][1]

        # A 4-field item whose attention path is None is treated the same as
        # an item without the field instead of failing inside np.load.
        attn = None
        if attn_file is not None:
            attn = np.load(attn_file)

        sample = {
            'text': text,
            'wav': wav,
            'attn': attn,
            'item_idx': self.items[idx][1],
            'speaker_name': speaker_name,
            'wav_file_name': os.path.basename(wav_file)
        }
        return sample
コード例 #3
0
    def load_data(self, idx):
        """Build the sample dictionary for the item at position ``idx``.

        Items hold ``(text, wav_file, speaker_name)`` and may carry a fourth
        field with the path of an attention file. The text is converted to
        an input sequence here only when ``self.input_seq_computed`` is
        false; otherwise the stored value is used as is.

        Args:
            idx: index into ``self.items``.

        Returns:
            dict with the keys ``text``, ``wav``, ``attn``, ``item_idx``,
            ``speaker_name`` and ``wav_file_name``. ``attn`` is ``None``
            when the item has no attention file. If the text is longer than
            ``self.max_seq_len`` the sample of a fixed fallback item is
            returned instead.

        Raises:
            AssertionError: if the text sequence or the waveform is empty;
                the message is the wav file of the offending item.
        """
        item = self.items[idx]

        # Bind attn_file on both paths so the decision to load it below does
        # not depend on inspecting locals().
        if len(item) == 4:
            text, wav_file, speaker_name, attn_file = item
        else:
            text, wav_file, speaker_name = item
            attn_file = None

        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

        # apply noise for augmentation
        if self.use_noise_augment:
            wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)

        if not self.input_seq_computed:
            if self.use_phonemes:
                text = self._load_or_generate_phoneme_sequence(
                    wav_file,
                    text,
                    self.phoneme_cache_path,
                    self.enable_eos_bos,
                    self.cleaners,
                    self.phoneme_language,
                    self.tp,
                    self.add_blank,
                )

            else:
                text = np.asarray(text_to_sequence(text, [self.cleaners],
                                                   tp=self.tp,
                                                   add_blank=self.add_blank),
                                  dtype=np.int32)

        assert text.size > 0, self.items[idx][1]
        assert wav.size > 0, self.items[idx][1]

        # A 4-field item whose attention path is None is treated the same as
        # an item without the field instead of failing inside np.load.
        attn = None
        if attn_file is not None:
            attn = np.load(attn_file)

        if len(text) > self.max_seq_len:
            # return a different sample if the phonemized
            # text is longer than the threshold
            # TODO: find a better fix
            # NOTE(review): the fallback index is hard-coded; this raises
            # IndexError when the dataset has fewer than 101 items and
            # recurses without end if item 100 is itself too long.
            return self.load_data(100)

        sample = {
            "text": text,
            "wav": wav,
            "attn": attn,
            "item_idx": self.items[idx][1],
            "speaker_name": speaker_name,
            "wav_file_name": os.path.basename(wav_file),
        }
        return sample
コード例 #4
0
ファイル: dataset.py プロジェクト: stjordanis/TTS
    def compute_input_seq(self, num_workers=0):
        """Replace the text of every item with its computed input sequence.

        Meant to be called before the dataset is passed to the data loader,
        so the sequences are cached on the items. When ``self.use_phonemes``
        is false, each text goes through ``text_to_sequence`` and is stored
        as an ``int32`` array. Otherwise ``_phoneme_worker`` is applied to
        every item: in this process when ``num_workers`` is 0, through a
        ``Pool`` of ``num_workers`` processes otherwise. Results are written
        in place to ``self.items[idx][0]``.
        """
        if not self.use_phonemes:
            if self.verbose:
                print(" | > Computing input sequences ...")
            for position, entry in enumerate(tqdm.tqdm(self.items)):
                raw_text, *_ = entry
                token_ids = text_to_sequence(
                    raw_text,
                    [self.cleaners],
                    custom_symbols=self.custom_symbols,
                    tp=self.characters,
                    add_blank=self.add_blank,
                )
                self.items[position][0] = np.asarray(token_ids,
                                                     dtype=np.int32)
            return

        # Arguments shared by every phoneme worker call.
        worker_args = [
            self.phoneme_cache_path,
            self.enable_eos_bos,
            self.cleaners,
            self.phoneme_language,
            self.custom_symbols,
            self.characters,
            self.add_blank,
        ]
        if self.verbose:
            print(" | > Computing phonemes ...")

        if num_workers == 0:
            for position, entry in enumerate(tqdm.tqdm(self.items)):
                self.items[position][0] = self._phoneme_worker(
                    [entry, worker_args])
            return

        with Pool(num_workers) as pool:
            jobs = [[entry, worker_args] for entry in self.items]
            # imap yields results in submission order, so positions line up
            # with self.items.
            progress = tqdm.tqdm(
                pool.imap(TTSDataset._phoneme_worker, jobs),
                total=len(self.items),
            )
            computed = list(progress)
            for position, sequence in enumerate(computed):
                self.items[position][0] = sequence
コード例 #5
0
ファイル: TTSDataset.py プロジェクト: cs50victor/riri
    def load_data(self, idx):
        """Build the sample dictionary for the item at position ``idx``.

        Each item is a ``(text, wav_file, speaker_name)`` triple. The
        waveform comes from ``self.load_wav`` as ``float32``; the text is
        turned into a sequence by the phoneme loader when
        ``self.use_phonemes`` is set and by ``text_to_sequence`` otherwise.

        Returns:
            dict with the keys ``text``, ``wav``, ``item_idx`` (the wav file
            of the item) and ``speaker_name``.
        """
        raw_text, wav_path, speaker = self.items[idx]
        audio = np.asarray(self.load_wav(wav_path), dtype=np.float32)

        if not self.use_phonemes:
            token_ids = text_to_sequence(raw_text, [self.cleaners],
                                         tp=self.tp)
            encoded = np.asarray(token_ids, dtype=np.int32)
        else:
            encoded = self._load_or_generate_phoneme_sequence(
                wav_path, raw_text)

        # Neither the sequence nor the audio may be empty; the wav file of
        # the item is used as the failure message.
        assert encoded.size > 0, self.items[idx][1]
        assert audio.size > 0, self.items[idx][1]

        return {
            'text': encoded,
            'wav': audio,
            'item_idx': self.items[idx][1],
            'speaker_name': speaker
        }
コード例 #6
0
ファイル: dataset.py プロジェクト: stjordanis/TTS
    def load_data(self, idx):
        """Build the sample dictionary for the item at position ``idx``.

        Items hold ``(text, wav_file, speaker_name, language_name)`` and may
        carry a fifth field with the path of an attention file. The text is
        converted to an input sequence here only when
        ``self.input_seq_computed`` is false; otherwise the stored value is
        used as is.

        Args:
            idx: index into ``self.items``.

        Returns:
            dict with the keys ``raw_text``, ``text``, ``wav``, ``pitch``,
            ``attn``, ``item_idx``, ``speaker_name``, ``language_name`` and
            ``wav_file_name``. ``attn`` is ``None`` when the item has no
            attention file and ``pitch`` is ``None`` unless
            ``self.compute_f0`` is set. If the text is longer than
            ``self.max_seq_len`` the sample of ``self.rescue_item_idx`` is
            returned instead.

        Raises:
            AssertionError: if the text sequence or the waveform is empty;
                the message is the wav file of the offending item.
        """
        item = self.items[idx]

        # Bind attn_file on both paths so the decision to load it below does
        # not depend on inspecting locals().
        if len(item) == 5:
            text, wav_file, speaker_name, language_name, attn_file = item
        else:
            text, wav_file, speaker_name, language_name = item
            attn_file = None
        raw_text = text

        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

        # apply noise for augmentation
        if self.use_noise_augment:
            wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)

        if not self.input_seq_computed:
            if self.use_phonemes:
                text = self._load_or_generate_phoneme_sequence(
                    wav_file,
                    text,
                    self.phoneme_cache_path,
                    self.enable_eos_bos,
                    self.cleaners,
                    # the item's own language wins over the dataset default
                    language_name if language_name else self.phoneme_language,
                    self.custom_symbols,
                    self.characters,
                    self.add_blank,
                )
            else:
                text = np.asarray(
                    text_to_sequence(
                        text,
                        [self.cleaners],
                        custom_symbols=self.custom_symbols,
                        tp=self.characters,
                        add_blank=self.add_blank,
                    ),
                    dtype=np.int32,
                )

        assert text.size > 0, self.items[idx][1]
        assert wav.size > 0, self.items[idx][1]

        # A 5-field item whose attention path is None is treated the same as
        # an item without the field instead of failing inside np.load.
        attn = None
        if attn_file is not None:
            attn = np.load(attn_file)

        if len(text) > self.max_seq_len:
            # return a different sample if the phonemized
            # text is longer than the threshold
            # TODO: find a better fix
            return self.load_data(self.rescue_item_idx)

        pitch = None
        if self.compute_f0:
            pitch = self.pitch_extractor.load_or_compute_pitch(
                self.ap, wav_file, self.f0_cache_path)
            pitch = self.pitch_extractor.normalize_pitch(
                pitch.astype(np.float32))

        sample = {
            "raw_text": raw_text,
            "text": text,
            "wav": wav,
            "pitch": pitch,
            "attn": attn,
            "item_idx": self.items[idx][1],
            "speaker_name": speaker_name,
            "language_name": language_name,
            "wav_file_name": os.path.basename(wav_file),
        }
        return sample