def predict(tst_stn):
    """Synthesize speech for a single utterance string.

    Args:
        tst_stn: raw input text (leading/trailing whitespace is ignored).

    Returns:
        (sampling_rate, audio) where audio is an (n_samples, 2) numpy array —
        the mono waveform duplicated into two channels.
    """
    if getattr(hps.data, "add_blank", False):
        text_norm = text_to_sequence(tst_stn.strip(), ['english_cleaners'], cmu_dict)
        # Interleave a blank token between symbols; its id is len(symbols),
        # matching the "add_blank" preprocessing used during training.
        text_norm = commons.intersperse(text_norm, len(symbols))
    else:
        # Without "add_blank" during training, padding the utterance with a
        # space at each end improves synthesis quality.
        # BUG FIX: the original called text_to_sequence(tst_stn.strip(), ...)
        # here, stripping off the very padding added on the previous line.
        tst_stn = " " + tst_stn.strip() + " "
        text_norm = text_to_sequence(tst_stn, ['english_cleaners'], cmu_dict)
    sequence = np.array(text_norm)[None, :]  # add batch dimension -> (1, T)
    # torch.autograd.Variable is deprecated (no-op wrapper since PyTorch 0.4);
    # a plain tensor is equivalent.
    x_tst = torch.from_numpy(sequence).long()
    x_tst_lengths = torch.tensor([x_tst.shape[1]])
    with torch.no_grad():
        noise_scale = .667
        length_scale = 1.0
        # Model returns nested tuples; keep the generated mel and attention.
        (y_gen_tst, *_), *_, (attn_gen, *_) = model(
            x_tst, x_tst_lengths, gen=True,
            noise_scale=noise_scale, length_scale=length_scale)
        audio = waveglow.infer(y_gen_tst, sigma=.666)
        audio = normalize_audio(audio[0].clamp(-1, 1).data.cpu().float().numpy())
    # Duplicate mono channel into stereo for playback widgets.
    return hps.data.sampling_rate, np.stack([audio, audio], 1)
def get_text(self, text):
    """Convert raw *text* into a torch.IntTensor of symbol ids.

    Uses the instance's configured cleaners (and CMU dictionary when
    available); when ``self.add_blank`` is set, a blank token is
    interleaved between every pair of ids.
    """
    seq = text_to_sequence(text, self.text_cleaners, getattr(self, "cmudict", None))
    if self.add_blank:
        # add a blank token, whose id number is len(symbols)
        seq = commons.intersperse(seq, len(symbols))
    return torch.IntTensor(seq)
def get_text(self, text):
    """Run *text* through the instance's cleaner and return a torch.IntTensor.

    When ``self.add_blank`` is set, a blank token is interleaved between
    every pair of ids; its id is ``len(self.cleaner)`` — per the original
    comment this equals len(symbols), i.e. the cleaner presumably exposes
    the symbol count via __len__ (TODO confirm against the cleaner class).
    """
    ids = self.cleaner(text)
    if self.add_blank:
        # add a blank token, whose id number is len(symbols)
        ids = commons.intersperse(ids, len(self.cleaner))
    return torch.IntTensor(ids)
def get_text(self, text):
    """Map *text* to a torch.LongTensor of symbol ids.

    If ``self.cleaned_text`` is set the input is assumed to be already
    cleaned; otherwise it is passed through ``self.text_cleaners``. When
    ``self.add_blank`` is set, a blank token (id 0 in this symbol table)
    is interleaved between every pair of ids.
    """
    if self.cleaned_text:
        ids = cleaned_text_to_sequence(text)
    else:
        ids = text_to_sequence(text, self.text_cleaners)
    if self.add_blank:
        ids = commons.intersperse(ids, 0)
    return torch.LongTensor(ids)