Example 1
    def __getitem__(self, idx):
        name = self.metas.iloc[idx, 0]
        path = "{}/wavs/{}.wav".format(self.root_path, name)

        # Text normalization
        text = self.metas.iloc[idx, 1]
        text_norm = self.metas.iloc[idx, 2]
        text_encoded = np.array(text_to_sequence(text_norm, self.text_cleaner))
        text_pos = np.arange(1, len(text_encoded) + 1)  # 1-based symbol positions

        data = {
            "name": name,
            "text": text,
            "text_norm": text_norm,
            "text_encoded": text_encoded,
            "text_pos": text_pos,
            "text_len": text_encoded.shape[-1],
            "sr": self.sr
        }

        if not self.exclude_mels:
            # librosa.load resamples to self.sr, so the check below should
            # never fire; it only guards against unexpected loader behaviour
            wav, sr = librosa.load(path, sr=self.sr)  # wav is in [-1.0, 1.0]
            if sr != self.sr:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sr, self.sr))

            # Audio processing
            wav, _ = librosa.effects.trim(wav,
                                          frame_length=self.win_len,
                                          hop_length=self.hop_len)

            if self.mels_path:
                mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
            else:
                # librosa made `y` keyword-only in 0.10, so pass it by name
                mel = librosa.feature.melspectrogram(y=wav,
                                                     sr=sr,
                                                     n_fft=self.n_fft,
                                                     win_length=self.win_len,
                                                     hop_length=self.hop_len,
                                                     n_mels=self.n_mels,
                                                     fmin=self.mel_fmin,
                                                     fmax=self.mel_fmax,
                                                     power=1.0)
                # Compress the magnitude mel spectrogram to log scale
                mel = audio.dynamic_range_compression(mel)

            data_mel = {
                "wav": wav,
                "mel": mel,
                "mel_len": mel.shape[-1],
            }
            data.update(data_mel)

        if self.aligns_path:
            aligns = np.load(
                os.path.join(self.aligns_path, name + ".align.npy"))
            data['align'] = aligns

        return data
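Because text_encoded and text_pos vary in length from item to item, batches from this dataset need padding before they can be stacked. A minimal collate sketch, assuming the items come from a torch Dataset with this __getitem__ (the TTSDataset name and the pad_1d helper are hypothetical, not part of the original):

import numpy as np
import torch
from torch.utils.data import DataLoader

def pad_1d(arrays, pad_value=0):
    # Right-pad a list of 1-D numpy arrays to the length of the longest one.
    max_len = max(a.shape[-1] for a in arrays)
    out = np.full((len(arrays), max_len), pad_value, dtype=arrays[0].dtype)
    for i, a in enumerate(arrays):
        out[i, :a.shape[-1]] = a
    return torch.from_numpy(out)

def collate_fn(batch):
    return {
        "text_encoded": pad_1d([b["text_encoded"] for b in batch]),
        "text_pos": pad_1d([b["text_pos"] for b in batch]),
        "text_len": torch.tensor([b["text_len"] for b in batch]),
    }

# loader = DataLoader(TTSDataset(...), batch_size=16, collate_fn=collate_fn)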
Example 2
    def __getitem__(self, idx):
        text = self.texts[idx]

        # Text encoding: at inference time there is no separately normalized
        # transcript, so the raw text also serves as text_norm below
        text_encoded = np.array(text_to_sequence(text, self.text_cleaner))
        text_pos = np.arange(1, len(text_encoded) + 1)  # 1-based symbol positions

        data = {
            "text": text,
            "text_norm": text,
            "text_encoded": text_encoded,
            "text_pos": text_pos,
        }

        return data
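For illustration, indexing this dataset might look as follows (the dataset variable and the sample sentence are hypothetical; the actual symbol IDs depend on the table behind text_to_sequence):

item = dataset[0]         # suppose self.texts[0] == "Hello world."
item["text_encoded"]      # 1-D array of symbol IDs, length T
item["text_pos"]          # array([1, 2, ..., T])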
Example 3
def synthesize(text, voice, sigma=0.6, denoiser_strength=0.1, is_fp16=False):

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    if voice == "papaito":
        voice_model = "nvidia_tacotron2_papaito_300"
    elif voice == "constantino":
        voice_model = "tacotron2_Constantino_600"
    elif voice == "orador":
        voice_model = "checkpoint_tacotron2_29000_es"
    else:
        # without this, an unknown voice leaves voice_model undefined
        raise ValueError("Unknown voice: {}".format(voice))

    checkpoint_path = "/home/debian/workspace/models/" + voice_model

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()

    waveglow_path = '/home/debian/workspace/models/waveglow_256channels_ljs_v2.pt'
    waveglow = torch.load(waveglow_path, map_location='cuda')['model']
    _ = waveglow.cuda().eval().half()
    denoiser = Denoiser(waveglow)


    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()  # Variable is deprecated

    # note: mel_outputs_postnet is usually what gets fed to the vocoder,
    # but this example passes the pre-postnet mels
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    mel = mel_outputs.half() if is_fp16 else mel_outputs  # model already runs in fp16, so this cast is redundant
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=sigma)
        if denoiser_strength > 0:
            audio = denoiser(audio, denoiser_strength)
        audio = audio * hparams.max_wav_value
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')

    return audio, hparams.sampling_rate
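A possible caller, saving the int16 output with SciPy (the filename and the scipy.io.wavfile writer are assumptions; any WAV writer works):

from scipy.io.wavfile import write

audio, sr = synthesize("Hola, ¿qué tal?", voice="constantino")
write("constantino.wav", sr, audio)  # audio is already int16 at sr = 22050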
Example 4
#checkpoint_path = "output/checkpoint_29000"
checkpoint_path = args.checkpoint_path
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval().half()

#waveglow_path = '/media/debian/SSD_USB/models/waveglow_256channels_ljs_v2.pt'
waveglow = torch.load(args.waveglow_path)['model']
_ = waveglow.cuda().eval().half()
denoiser = Denoiser(waveglow)

with open(args.filelist_path, encoding='utf-8', mode='r') as f:
    text = f.read()

sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).cuda().long()  # Variable is deprecated

mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
mel = mel_outputs.half() if args.is_fp16 else mel_outputs  # model already runs in fp16, so this cast is redundant
with torch.no_grad():
    audio = waveglow.infer(mel, sigma=args.sigma)
    if args.denoiser_strength > 0:
        audio = denoiser(audio, args.denoiser_strength)
    audio = audio * MAX_WAV_VALUE
    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    audio = audio.astype('int16')
    file_name = "audio"
    # assumption: the filename pattern and scipy.io.wavfile's write below
    # follow the stock WaveGlow inference script
    audio_path = os.path.join(args.output_dir,
                              "{}_synthesis.wav".format(file_name))
    write(audio_path, hparams.sampling_rate, audio)