def compute_input_seq(self, num_workers=0):
    """Replace the text field of every item with its computed input sequence.

    Intended to be called before the dataset is passed to a data loader.

    When phonemes are disabled, each item's text is converted with
    ``text_to_sequence`` in the calling process. When phonemes are enabled,
    ``_phoneme_worker`` is invoked once per item, either serially or through
    a multiprocessing pool, and its result is stored in the item.

    Args:
        num_workers: Number of pool processes for the phoneme path. ``0``
            computes everything in the current process. Not used when
            phonemes are disabled.
    """
    if self.use_phonemes:
        # Positional arguments forwarded to ``_phoneme_worker`` with each item.
        worker_args = [
            self.phoneme_cache_path,
            self.enable_eos_bos,
            self.cleaners,
            self.phoneme_language,
            self.tp,
            self.add_blank,
        ]
        if self.verbose:
            print(" | > Computing phonemes ...")

        if num_workers == 0:
            for position, entry in enumerate(tqdm.tqdm(self.items)):
                self.items[position][0] = self._phoneme_worker(
                    [entry, worker_args])
            return

        with Pool(num_workers) as pool:
            jobs = [[entry, worker_args] for entry in self.items]
            progress = tqdm.tqdm(
                pool.imap(MyDataset._phoneme_worker, jobs),
                total=len(self.items),
            )
            # Drain the iterator while the pool is still alive.
            computed = list(progress)
        for position, phoneme_seq in enumerate(computed):
            self.items[position][0] = phoneme_seq
        return

    if self.verbose:
        print(" | > Computing input sequences ...")
    for position, entry in enumerate(tqdm.tqdm(self.items)):
        raw_text, *_ = entry
        token_ids = text_to_sequence(
            raw_text, [self.cleaners], tp=self.tp, add_blank=self.add_blank)
        self.items[position][0] = np.asarray(token_ids, dtype=np.int32)
def load_data(self, idx):
    """Load one item and return it as a sample dictionary.

    Items are ``[text, wav_file, speaker_name]`` with an optional fourth
    field holding the path of an attention file readable by ``np.load``.

    Args:
        idx: Index into ``self.items``.

    Returns:
        dict with keys ``text``, ``wav``, ``attn`` (``None`` when the item
        has no attention file), ``item_idx`` (the item's second field),
        ``speaker_name`` and ``wav_file_name`` (basename of the wav path).

    Raises:
        AssertionError: If the resulting text sequence or waveform is empty.
    """
    item = self.items[idx]

    # The attention file is optional; default it explicitly instead of
    # probing ``locals()`` later on.
    attn_file = None
    if len(item) == 4:
        text, wav_file, speaker_name, attn_file = item
    else:
        text, wav_file, speaker_name = item

    wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

    if self.use_phonemes:
        text = self._load_or_generate_phoneme_sequence(wav_file, text)
    else:
        text = np.asarray(
            text_to_sequence(
                text, [self.cleaners], tp=self.tp, add_blank=self.add_blank),
            dtype=np.int32,
        )

    assert text.size > 0, self.items[idx][1]
    assert wav.size > 0, self.items[idx][1]

    # A missing (or ``None``) attention path simply yields no attention.
    attn = np.load(attn_file) if attn_file is not None else None

    sample = {
        'text': text,
        'wav': wav,
        'attn': attn,
        'item_idx': self.items[idx][1],
        'speaker_name': speaker_name,
        'wav_file_name': os.path.basename(wav_file),
    }
    return sample
def load_data(self, idx):
    """Load one item and return it as a sample dictionary.

    Items are ``[text, wav_file, speaker_name]`` with an optional fourth
    field holding the path of an attention file readable by ``np.load``.
    The text is converted to a sequence here unless
    ``self.input_seq_computed`` is set, in which case the stored text is
    used as-is.

    If the text is longer than ``self.max_seq_len`` a fallback item is
    returned instead (index 100, clamped to the last item of the dataset).

    Args:
        idx: Index into ``self.items``.

    Returns:
        dict with keys ``text``, ``wav``, ``attn`` (``None`` when the item
        has no attention file), ``item_idx`` (the item's second field),
        ``speaker_name`` and ``wav_file_name`` (basename of the wav path).

    Raises:
        AssertionError: If the text sequence or waveform is empty.
        RuntimeError: If the fallback item is itself the over-long item, so
            no replacement can be returned.
    """
    item = self.items[idx]

    # The attention file is optional; default it explicitly instead of
    # probing ``locals()`` later on.
    attn_file = None
    if len(item) == 4:
        text, wav_file, speaker_name, attn_file = item
    else:
        text, wav_file, speaker_name = item

    wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

    # apply noise for augmentation
    if self.use_noise_augment:
        wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)

    if not self.input_seq_computed:
        if self.use_phonemes:
            text = self._load_or_generate_phoneme_sequence(
                wav_file,
                text,
                self.phoneme_cache_path,
                self.enable_eos_bos,
                self.cleaners,
                self.phoneme_language,
                self.tp,
                self.add_blank,
            )
        else:
            text = np.asarray(
                text_to_sequence(
                    text,
                    [self.cleaners],
                    tp=self.tp,
                    add_blank=self.add_blank,
                ),
                dtype=np.int32,
            )

    assert text.size > 0, self.items[idx][1]
    assert wav.size > 0, self.items[idx][1]

    # A missing (or ``None``) attention path simply yields no attention.
    attn = np.load(attn_file) if attn_file is not None else None

    if len(text) > self.max_seq_len:
        # return a different sample if the phonemized
        # text is longer than the threshold
        # TODO: find a better fix
        # Clamp the fallback index so small datasets do not raise IndexError.
        rescue_idx = min(100, len(self.items) - 1)
        if rescue_idx == idx:
            # Falling back onto the same item would recurse without end.
            raise RuntimeError(
                f"Item {idx} exceeds max_seq_len ({self.max_seq_len}) and is "
                "also the fallback item; no replacement sample is available."
            )
        return self.load_data(rescue_idx)

    sample = {
        "text": text,
        "wav": wav,
        "attn": attn,
        "item_idx": self.items[idx][1],
        "speaker_name": speaker_name,
        "wav_file_name": os.path.basename(wav_file),
    }
    return sample
def compute_input_seq(self, num_workers=0):
    """Compute and store the input sequence of every item.

    Call it before passing the dataset to the data loader so the input
    sequences are cached in ``self.items`` for faster data loading.

    When phonemes are disabled, each item's text is converted with
    ``text_to_sequence`` in the calling process. When phonemes are enabled,
    ``_phoneme_worker`` is invoked once per item, either serially or through
    a multiprocessing pool, and its result is stored in the item.

    Args:
        num_workers: Number of pool processes for the phoneme path. ``0``
            computes everything in the current process. Not used when
            phonemes are disabled.
    """
    if self.use_phonemes:
        # Positional arguments forwarded to ``_phoneme_worker`` with each item.
        worker_args = [
            self.phoneme_cache_path,
            self.enable_eos_bos,
            self.cleaners,
            self.phoneme_language,
            self.custom_symbols,
            self.characters,
            self.add_blank,
        ]
        if self.verbose:
            print(" | > Computing phonemes ...")

        if num_workers == 0:
            for position, entry in enumerate(tqdm.tqdm(self.items)):
                self.items[position][0] = self._phoneme_worker(
                    [entry, worker_args])
            return

        with Pool(num_workers) as pool:
            jobs = [[entry, worker_args] for entry in self.items]
            progress = tqdm.tqdm(
                pool.imap(TTSDataset._phoneme_worker, jobs),
                total=len(self.items),
            )
            # Drain the iterator while the pool is still alive.
            computed = list(progress)
        for position, phoneme_seq in enumerate(computed):
            self.items[position][0] = phoneme_seq
        return

    if self.verbose:
        print(" | > Computing input sequences ...")
    for position, entry in enumerate(tqdm.tqdm(self.items)):
        raw_text, *_ = entry
        token_ids = text_to_sequence(
            raw_text,
            [self.cleaners],
            custom_symbols=self.custom_symbols,
            tp=self.characters,
            add_blank=self.add_blank,
        )
        self.items[position][0] = np.asarray(token_ids, dtype=np.int32)
def load_data(self, idx):
    """Build the sample dictionary for the item stored at ``idx``.

    Each item is unpacked as ``(text, wav_file, speaker_name)``. The
    waveform is read through ``self.load_wav`` and the text is turned into
    a sequence, either via the phoneme helper or ``text_to_sequence``.

    Args:
        idx: Index into ``self.items``.

    Returns:
        dict with keys ``text``, ``wav``, ``item_idx`` (the item's second
        field) and ``speaker_name``.

    Raises:
        AssertionError: If the text sequence or the waveform is empty.
    """
    raw_text, wav_path, speaker = self.items[idx]
    waveform = np.asarray(self.load_wav(wav_path), dtype=np.float32)

    if not self.use_phonemes:
        token_ids = text_to_sequence(raw_text, [self.cleaners], tp=self.tp)
        sequence = np.asarray(token_ids, dtype=np.int32)
    else:
        sequence = self._load_or_generate_phoneme_sequence(wav_path, raw_text)

    # Both checks report the item's second field on failure.
    assert sequence.size > 0, self.items[idx][1]
    assert waveform.size > 0, self.items[idx][1]

    return {
        'text': sequence,
        'wav': waveform,
        'item_idx': self.items[idx][1],
        'speaker_name': speaker,
    }
def load_data(self, idx):
    """Load one item and return it as a sample dictionary.

    Items are ``[text, wav_file, speaker_name, language_name]`` with an
    optional fifth field holding the path of an attention file readable by
    ``np.load``. The text is converted to a sequence here unless
    ``self.input_seq_computed`` is set, in which case the stored text is
    used as-is.

    If the text is longer than ``self.max_seq_len`` the item at
    ``self.rescue_item_idx`` is returned instead.

    Args:
        idx: Index into ``self.items``.

    Returns:
        dict with keys ``raw_text`` (the item's text field before any
        conversion done here), ``text``, ``wav``, ``pitch`` (``None`` unless
        ``self.compute_f0`` is set), ``attn`` (``None`` when the item has no
        attention file), ``item_idx`` (the item's second field),
        ``speaker_name``, ``language_name`` and ``wav_file_name`` (basename
        of the wav path).

    Raises:
        AssertionError: If the text sequence or waveform is empty.
        RuntimeError: If the rescue item is itself the over-long item, so no
            replacement can be returned.
    """
    item = self.items[idx]

    # The attention file is optional; default it explicitly instead of
    # probing ``locals()`` later on.
    attn_file = None
    if len(item) == 5:
        text, wav_file, speaker_name, language_name, attn_file = item
    else:
        text, wav_file, speaker_name, language_name = item

    raw_text = text

    wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

    # apply noise for augmentation
    if self.use_noise_augment:
        wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)

    if not self.input_seq_computed:
        if self.use_phonemes:
            text = self._load_or_generate_phoneme_sequence(
                wav_file,
                text,
                self.phoneme_cache_path,
                self.enable_eos_bos,
                self.cleaners,
                # A per-item language overrides the dataset-wide default.
                language_name if language_name else self.phoneme_language,
                self.custom_symbols,
                self.characters,
                self.add_blank,
            )
        else:
            text = np.asarray(
                text_to_sequence(
                    text,
                    [self.cleaners],
                    custom_symbols=self.custom_symbols,
                    tp=self.characters,
                    add_blank=self.add_blank,
                ),
                dtype=np.int32,
            )

    assert text.size > 0, self.items[idx][1]
    assert wav.size > 0, self.items[idx][1]

    # A missing (or ``None``) attention path simply yields no attention.
    attn = np.load(attn_file) if attn_file is not None else None

    if len(text) > self.max_seq_len:
        # return a different sample if the phonemized
        # text is longer than the threshold
        # TODO: find a better fix
        if idx == self.rescue_item_idx:
            # Falling back onto the same item would recurse without end.
            raise RuntimeError(
                f"Rescue item {idx} exceeds max_seq_len ({self.max_seq_len}); "
                "no replacement sample is available."
            )
        return self.load_data(self.rescue_item_idx)

    pitch = None
    if self.compute_f0:
        pitch = self.pitch_extractor.load_or_compute_pitch(
            self.ap, wav_file, self.f0_cache_path)
        pitch = self.pitch_extractor.normalize_pitch(
            pitch.astype(np.float32))

    sample = {
        "raw_text": raw_text,
        "text": text,
        "wav": wav,
        "pitch": pitch,
        "attn": attn,
        "item_idx": self.items[idx][1],
        "speaker_name": speaker_name,
        "language_name": language_name,
        "wav_file_name": os.path.basename(wav_file),
    }
    return sample