Example #1
import io

import numpy as np
import torch
from torch.autograd import Variable

# Project-local helpers; the module paths below are assumptions and vary
# between Tacotron implementations.
import hyperparams as hp
from text import text_to_sequence
from utils.audio import inv_spectrogram, find_endpoint, save_wav


def generate(model, text):

    # Text to index sequence
    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
    seq = np.expand_dims(np.asarray(text_to_sequence(text, cleaner_names),
                                    dtype=np.int32),
                         axis=0)

    # Provide [GO] Frame
    mel_input = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)

    # Wrap inputs as inference-only Variables (volatile=True disables
    # autograd; this is the legacy PyTorch < 0.4 API)
    characters = Variable(torch.from_numpy(seq).type(torch.cuda.LongTensor),
                          volatile=True).cuda()
    mel_input = Variable(torch.from_numpy(mel_input).type(
        torch.cuda.FloatTensor),
                         volatile=True).cuda()

    # Run the model, then invert the predicted linear spectrogram to audio
    _, linear_output = model(characters, mel_input)
    wav = inv_spectrogram(linear_output[0].data.cpu().numpy())
    wav = wav[:find_endpoint(wav)]
    out = io.BytesIO()
    save_wav(wav, out)

    return out.getvalue()
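A minimal usage sketch for this helper. The Tacotron constructor and the 'model' checkpoint key are taken from Example #4 below; the module paths and file names are assumptions.

import torch
from network import Tacotron        # assumed module path
from text.symbols import symbols    # assumed symbol table

model = Tacotron(len(symbols)).cuda()
checkpoint = torch.load('checkpoint.pth')   # hypothetical checkpoint file
model.load_state_dict(checkpoint['model'])
model.eval()

wav_bytes = generate(model, 'Hello world.')
with open('result.wav', 'wb') as f:
    f.write(wav_bytes)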
Example #2
import io

import numpy as np
import torch

# Project-local helpers; the module paths below are assumptions.
import hyperparams as hp
from text import text_to_sequence
from utils.audio import inv_spectrogram, find_endpoint, save_wav


def generate(model, text, device, writer, curr, _tt):

    # Text to index sequence
    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
    seq = np.expand_dims(np.asarray(text_to_sequence(text, cleaner_names),
                                    dtype=np.int32),
                         axis=0)

    # Provide [GO] Frame
    mel_input = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)

    # Build input tensors directly on the target device
    characters = torch.from_numpy(seq).long().to(device)
    mel_input = torch.from_numpy(mel_input).float().to(device)
    # Transpose to (batch, time, num_mels) as the decoder expects
    mel_input = torch.transpose(mel_input, 1, 2)

    # Run the model, then invert the predicted linear spectrogram to audio
    _, linear_output = model(characters, mel_input, False)
    linear_output = torch.transpose(linear_output, 1, 2)
    wav = inv_spectrogram(linear_output[0].data.cpu().numpy())
    wav = wav[:find_endpoint(wav)]

    # Peak-normalize to [-1, 1] for TensorBoard logging; the 0.01 floor
    # avoids division by (near) zero on silent output
    wav_norm = wav * 1.0 / max(0.01, np.max(np.abs(wav)))
    wav_tensor = torch.Tensor(wav_norm).to(device).view(1, -1)

    writer.add_audio('audio_result_%02d' % (_tt), wav_tensor, curr,
                     hp.sample_rate)

    out = io.BytesIO()
    save_wav(wav, out)

    return out.getvalue()
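The peak normalization above matters because TensorBoard expects audio samples in [-1, 1]. Here is that step in isolation as a standalone helper (a sketch; the function name is made up):

import numpy as np
import torch

def to_audio_tensor(wav, device):
    # Peak-normalize to [-1, 1]; the 0.01 floor guards against division
    # by (near) zero when the model emits silence.
    wav = wav * 1.0 / max(0.01, np.max(np.abs(wav)))
    return torch.Tensor(wav).to(device).view(1, -1)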
Example #3
    # Relies on module-level imports: numpy as np, torch, hyperparams as hp,
    # and the project helpers text_to_sequence, _prepare_data,
    # inv_spectrogram and find_endpoint
    def generate_audio(self):
        print('Generating Audio Samples')
        # Text to index sequence
        characters = []
        for text in self.sentences:
            text = np.asarray(text_to_sequence(text, [hp.cleaners]),
                              dtype=np.int32)
            characters.append(text)

        characters = _prepare_data(characters).astype(np.int32)
        characters = torch.from_numpy(characters).long().to(self.args.device)

        # Provide [GO] Frame
        mel_input = torch.zeros([characters.shape[0], hp.num_mels, 1],
                                dtype=torch.float).to(self.args.device)

        print('char: ', characters.shape)
        print('mel input: ', mel_input.shape)

        self.model.eval()
        # Spectrogram prediction; no_grad since this is inference only
        with torch.no_grad():
            _, linear_output = self.model(characters, mel_input)

        for i in range(linear_output.shape[0]):
            wav = inv_spectrogram(linear_output[i].data.cpu().numpy())
            wav = wav[:find_endpoint(wav)].astype(np.float32)
            print('wav: shape=%s dtype=%s min=%.4f max=%.4f' %
                  (wav.shape, wav.dtype, wav.min(), wav.max()))
            # Use a per-clip tag so samples do not overwrite each other
            self.writer.add_audio('audio_%02d' % i, wav, self.epoch,
                                  sample_rate=16000)
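_prepare_data is not defined in this example; in the Tacotron-pytorch codebases it is typically a zero-padding batcher along the following lines (a sketch, not necessarily this project's exact code):

import numpy as np

def _pad_data(x, length):
    # Right-pad a 1-D index sequence with zeros (the padding symbol).
    return np.pad(x, (0, length - x.shape[0]), mode='constant')

def _prepare_data(inputs):
    # Pad every sequence to the longest one so they stack into a batch.
    max_len = max(x.shape[0] for x in inputs)
    return np.stack([_pad_data(x, max_len) for x in inputs])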
Example #4
import io

import numpy as np
import torch

# Project-local helpers; the module paths below are assumptions.
import hyperparams as hp
from network import Tacotron
from text import text_to_sequence
from text.symbols import symbols, en_symbols
from utils.audio import inv_spectrogram, find_endpoint, save_wav


def main(args):

    device = torch.device('cuda:0')

    # Pick the symbol table matching the configured cleaners
    if 'english' in hp.cleaners:
        _symbols = en_symbols
    elif 'korean' in hp.cleaners:
        _symbols = symbols
    else:
        raise ValueError('unsupported cleaners: %s' % hp.cleaners)

    model = Tacotron(len(_symbols)).to(device)

    checkpoint = torch.load(args.checkpoint_path)
    model.load_state_dict(checkpoint['model'])

    model = model.eval()

    sentences = [
        'Scientists at the CERN laboratory say they have discovered a new particle.',
        'President Trump met with other leaders at the Group of 20 conference.',
        'Generative adversarial network or variational auto-encoder.',
        'Does the quick brown fox jump over the lazy dog?'
    ]

    for i, ele in enumerate(sentences):
        # Text to index sequence
        cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
        seq = np.expand_dims(np.asarray(text_to_sequence(ele, cleaner_names),
                                        dtype=np.int32),
                             axis=0)

        # Provide [GO] Frame
        mel_input = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)

        # Build input tensors directly on the target device
        characters = torch.from_numpy(seq).long().to(device)
        mel_input = torch.from_numpy(mel_input).float().to(device)
        # Transpose to (batch, time, num_mels) as the decoder expects
        mel_input = torch.transpose(mel_input, 1, 2)

        # Run the model, then invert the predicted linear spectrogram to audio
        mel_output, linear_output = model(characters, mel_input, False)

        linear_output = torch.transpose(linear_output, 1, 2)
        wav = inv_spectrogram(linear_output[0].data.cpu().numpy())
        _wav = wav[:find_endpoint(wav)]
        out = io.BytesIO()
        save_wav(_wav, out)

        with open('./result_%02d.wav' % i, 'wb') as f:
            f.write(out.getvalue())
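A plausible command-line entry point for this script; only --checkpoint_path is grounded in the code above (args.checkpoint_path), the rest is a sketch:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint_path', type=str, required=True,
                        help='path to a trained Tacotron checkpoint')
    main(parser.parse_args())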