def __getitem__(self, index):
    """Return one training example for the TTS model.

    Loads the precomputed mel/energy/pitch arrays for the utterance named in
    the metadata row and returns:
        (token_ids, mel [T, num_mel], utt_id, mel_len, durations, energy, pitch)
    """
    # Utterance id is the filename column with its extension dropped.
    # (renamed from `id`, which shadowed the builtin)
    utt_id = self._metadata[index][4].split(".")[0]
    tokens = self._metadata[index][3].split()
    if self.use_phonemes:
        x = phonemes_to_sequence(tokens)
    else:
        x = text_to_sequence(tokens, self.tts_cleaner_names, self.eos)

    mel = np.load(f"{self.path}mels/{utt_id}.npy")  # [num_mel, T]
    durations = str_to_int_list(self._metadata[index][2])
    # Outlier frames are removed from energy/pitch; any mean/std
    # normalization is left to the consumer of this dataset.
    e = remove_outlier(np.load(f"{self.path}energy/{utt_id}.npy"))
    p = remove_outlier(np.load(f"{self.path}pitch/{utt_id}.npy"))

    mel_len = mel.shape[1]
    # Clip durations to the token count, then absorb any frame-count
    # mismatch into the last duration so they sum exactly to mel frames.
    durations = durations[: len(x)]
    durations[-1] = durations[-1] + (mel.shape[1] - sum(durations))
    assert mel.shape[1] == sum(durations)

    return (
        np.array(x),
        mel.T,  # transpose to [T, num_mel]
        utt_id,
        mel_len,
        np.array(durations),
        e,
        p,
    )
def __getitem__(self, index):
    """Return one example: (token_ids, mel [T, num_mel], utt_id, mel_len, durations, energy, pitch).

    Unlike the other dataset variant, energy/pitch are returned raw
    (no outlier removal) and config is read from the module-level ``hp``.
    """
    # Utterance id is the filename column with its extension dropped.
    # (renamed from `id`, which shadowed the builtin)
    utt_id = self._metadata[index][4].split(".")[0]
    tokens = self._metadata[index][3].split()
    if hp.use_phonemes:
        x = phonemes_to_sequence(tokens)
    else:
        x = text_to_sequence(tokens, hp.tts_cleaner_names)

    mel = np.load(f'{self.path}mels/{utt_id}.npy')  # [num_mel, T]
    durations = str_to_int_list(self._metadata[index][2])
    e = np.load(f'{self.path}energy/{utt_id}.npy')
    p = np.load(f'{self.path}pitch/{utt_id}.npy')

    mel_len = mel.shape[1]
    # Absorb any frame-count mismatch into the last duration so the
    # durations sum exactly to the number of mel frames.
    durations[-1] = durations[-1] + (mel.shape[1] - sum(durations))
    return np.array(x), mel.T, utt_id, mel_len, np.array(durations), e, p  # Mel [T, num_mel]
def synthesis(args, text, hp):
    """Decode ``text`` with an E2E-TTS model and return the mel spectrogram.

    Builds a FeedForwardTransformer from ``hp``, loads the checkpoint at
    ``args.path``, and runs inference on the phoneme sequence of ``text``.

    Returns:
        numpy array of shape [T_out, num_mel], or None when no checkpoint
        file exists at ``args.path``.
    """
    set_deterministic_pytorch(args)
    # Read model dimensions from the training config.
    idim = hp.symbol_len
    odim = hp.num_mels
    model = FeedForwardTransformer(idim, odim, hp)
    print(model)
    if os.path.exists(args.path):
        print("\nSynthesis Session...\n")
        # strict=False tolerates checkpoints with extra/missing keys.
        model.load_state_dict(torch.load(args.path), strict=False)
    else:
        print("Checkpoint not exists")  # fixed typo "exixts"
        return None
    model.eval()

    # Select device once and move both model and input to it.
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    input_seq = np.asarray(phonemes_to_sequence(text.split()))
    text = torch.LongTensor(input_seq)
    text = text.to(device)  # was .cuda(): crashed when args.ngpu == 0 (CPU run)

    with torch.no_grad():
        start_time = time.time()
        print("text :", text.size())
        outs, probs, att_ws = model.inference(text, hp)
        print("Out size : ", outs.size())
        # msec/frame = elapsed seconds / frames * 1000
        # (previous code divided by 1000, under-reporting by 1e6).
        logging.info(
            "inference speed = %s msec / frame."
            % ((time.time() - start_time) / int(outs.size(0)) * 1000)
        )
        if outs.size(0) == text.size(0) * args.maxlenratio:
            logging.warning("output length reaches maximum length .")
        print("mels", outs.size())
        mel = outs.cpu().numpy()  # [T_out, num_mel]
        print("numpy ", mel.shape)
        return mel
def synth(text, model, hp):
    """Run TTS inference and return the predicted mel output.

    Args:
        text: sequence of phonemes (already split) for the input utterance.
        model: a TTS model exposing ``.inference(tokens)``.
        hp: config; ``hp.train.ngpu`` selects CUDA vs CPU.
    """
    print("TTS synthesis")
    model.eval()
    # Select device from config and move model and input to it.
    device = torch.device("cuda" if hp.train.ngpu > 0 else "cpu")
    model = model.to(device)
    # (renamed from `input`, which shadowed the builtin)
    input_seq = np.asarray(phonemes_to_sequence(text))
    tokens = torch.LongTensor(input_seq)
    tokens = tokens.to(device)
    with torch.no_grad():
        print("predicting")
        outs = model.inference(tokens)  # model(tokens) for jit script
        mel = outs
    return mel
def synthesis_tts(args, text, path):
    """Decode ``text`` with an E2E-TTS FeedForwardTransformer checkpoint.

    Builds the model from the module-level ``hp`` config, loads weights from
    ``path``, and runs inference on the phoneme sequence of ``text``.

    Returns:
        the output mel tensor [T_out, num_mel], or None when no checkpoint
        file exists at ``path``.
    """
    set_deterministic_pytorch(args)
    print("TTS synthesis")
    # Model dimensions come from the module-level hp config.
    idim = hp.symbol_len
    odim = hp.num_mels
    print("Text :", text)
    # (renamed from `input`, which shadowed the builtin)
    input_seq = np.asarray(phonemes_to_sequence(text.split()))
    print("Input :", input_seq)
    model = FeedForwardTransformer(idim, odim)
    if os.path.exists(path):
        logging.info('\nSynthesis Session...\n')
        # strict=False tolerates checkpoints with extra/missing keys.
        model.load_state_dict(torch.load(path), strict=False)
    else:
        logging.info("Checkpoint not exists")  # fixed typo "exixts"
        return None
    model.eval()

    # Select device once and move both model and input to it.
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)
    text = torch.LongTensor(input_seq)
    text = text.to(device)  # was .cuda(): crashed when args.ngpu == 0 (CPU run)

    # NOTE(review): removed a never-called nested `_plot_and_save` plotting
    # helper and an unused `idx` local — both were dead code.
    with torch.no_grad():
        start_time = time.time()
        print("predicting")
        outs, probs, att_ws = model.inference(text, args)
        # msec/frame = elapsed seconds / frames * 1000
        # (previous code divided by 1000, under-reporting by 1e6).
        logging.info(
            "inference speed = %s msec / frame."
            % ((time.time() - start_time) / int(outs.size(0)) * 1000)
        )
        if outs.size(0) == text.size(0) * 5:
            logging.warning("output length reaches maximum length .")
        return outs  # [T_out, num_mel]