Exemple #1
0
    def __call__(self, text, denoise=True):
        """
        Convert text to audio (inference only for now).

        args:
            text: The text to convert
            denoise: whether to reduce the waveglow bias to denoise the audio

        returns:
            A tuple ``(audio, mel_outputs, mel_outputs_postnet, alignments)``:
            the WaveGlow output (passed through the denoiser when ``denoise``
            is true) followed by the Tacotron outputs it was generated from.
        """
        with torch.no_grad():
            # Encode the text as symbol ids and add a leading batch axis.
            symbol_ids = text_to_sequence(text, ['english_cleaners'])
            sequence = torch.from_numpy(np.array(symbol_ids)[None, :]).long()

            if self.device.type == "cuda":
                sequence = sequence.cuda()

            mel_outputs, mel_outputs_postnet, _, alignments = \
                self.tacotron.inference(sequence)

            # The vocoder is driven by the post-net mel output.
            audio = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)

            if denoise:
                audio = self.denoiser(audio, strength=0.01)[:, 0]

            return audio, mel_outputs, mel_outputs_postnet, alignments
Exemple #2
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    For every anchor directory in ``args.anchor_dirs`` (its index is used as
    the speaker id) the metadata is loaded and each ``(mel_path, text)`` entry
    is run through the model together with the stored mel spectrogram,
    subsampled by ``args.reduction_factor``. The model's ``mel_outs`` output,
    cropped to the stored spectrogram's frame count, is saved under
    ``args.output`` using the input file's base name.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    if args.include_warmup:
        # Run a few throwaway inferences on random symbol ids before the
        # timed section starts.
        sequences = torch.randint(low=0, high=148, size=(1, 50), dtype=torch.long).cuda()
        # The length must come from `sequences`; no name `sequence` exists here.
        text_lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
        with torch.no_grad():
            for _ in range(3):
                model.infer(sequences, text_lengths)

    os.makedirs(args.output, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.anchor_dirs]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        # The position of an anchor directory doubles as its speaker id.
        for speaker_id, metadata in enumerate(metadatas):
            for mel_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                # NOTE(review): the length is taken from the raw text, not from
                # the encoded sequence -- confirm the two always match.
                seq_lens = torch.IntTensor([len(text)])
                melspec = torch.from_numpy(np.load(mel_path))
                # Keep every `reduction_factor`-th frame of the stored mel.
                target = melspec[:, ::args.reduction_factor]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                inputs = (
                    to_gpu(seqs).long(),
                    to_gpu(seq_lens).int(),
                    to_gpu(targets).float(),
                    to_gpu(target_lengths).int(),
                )
                _, mel_outs, _, _ = model(inputs)
                fname = os.path.basename(mel_path)
                # NOTE(review): mel_outs is saved as returned by the model; if
                # it is a CUDA tensor it must be moved to the CPU first -- confirm.
                np.save(os.path.join(args.output, fname), mel_outs[0, :, :melspec.shape[1]], allow_pickle=False)

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
Exemple #3
0
def prepare_input_sequence(texts, speaker_id):
    """Encode ``texts`` for one speaker and return the padded batch.

    Every text is converted with ``text_to_sequence`` (``basic_cleaners``) and
    the results are handed to ``pad_sequences``. The padded texts are cast to
    long and the lengths to int; both are moved to the GPU first when CUDA is
    available.

    Returns:
        ``(texts, text_lengths, ids_sorted_decreasing)`` as produced by
        ``pad_sequences``, after the device move and cast.
    """
    encoded = []
    for text in texts:
        encoded.append(text_to_sequence(text, speaker_id, ['basic_cleaners'])[:])

    padded, lengths, ids_sorted_decreasing = pad_sequences(encoded)

    if torch.cuda.is_available():
        padded = padded.cuda()
        lengths = lengths.cuda()

    return padded.long(), lengths.int(), ids_sorted_decreasing
def mapper(line):
    """Parse one three-field, ``|``-separated metadata line.

    The first field is a file path and the second the text; the third is
    ignored.

    Returns:
        ``(path, sequence_length, duration)`` where ``sequence_length`` is the
        length of the ``text_to_sequence`` encoding (``english_cleaners``) and
        ``duration`` is read through ``audioread``, or ``None`` when the path
        is not an existing file.
    """
    audio_path, transcript, _ = line.strip().split('|')
    n_symbols = len(text_to_sequence(transcript, ['english_cleaners']))

    if not os.path.isfile(audio_path):
        return audio_path, n_symbols, None

    with audioread.audio_open(audio_path) as audio_file:
        return audio_path, n_symbols, audio_file.duration
Exemple #5
0
def prepare_input_sequence(texts):
    """Encode and pad a batch of texts for inference.

    Args:
        texts: iterable of strings to encode with ``text_to_sequence``
            (``english_cleaners``).

    Returns:
        ``(text_padded, input_lengths)`` from ``pad_sequences``, both cast to
        long and placed on the GPU when CUDA is available.
    """
    encoded = [
        torch.IntTensor(text_to_sequence(text, ['english_cleaners']))
        for text in texts
    ]
    text_padded, input_lengths = pad_sequences(encoded)

    # Plain tensors are used directly: the torch.autograd.Variable wrapper is
    # deprecated and simply returns the tensor since PyTorch 0.4.
    if torch.cuda.is_available():
        text_padded = text_padded.cuda()
        input_lengths = input_lengths.cuda()

    return text_padded.long(), input_lengths.long()
Exemple #6
0
def prepare_input_sequence(texts, cpu_run=False):
    """Encode and pad a batch of texts.

    Each text is encoded with ``text_to_sequence`` (``english_cleaners``),
    wrapped in an ``IntTensor`` and the batch is padded by ``pad_sequences``.

    Args:
        texts: iterable of strings.
        cpu_run: when true the tensors stay on the CPU; otherwise they are
            moved to the GPU unconditionally.

    Returns:
        ``(text_padded, input_lengths)``, both cast to long.
    """
    encoded = [
        torch.IntTensor(text_to_sequence(text, ['english_cleaners'])[:])
        for text in texts
    ]
    text_padded, input_lengths = pad_sequences(encoded)

    if cpu_run:
        return text_padded.long(), input_lengths.long()

    return text_padded.cuda().long(), input_lengths.cuda().long()
Exemple #7
0
def get_mel_from_tacotron2(audiopath, text):
    """Run the module-level ``model`` on ``text`` with an embedding of ``audiopath``.

    The audio file is loaded, preprocessed and embedded through ``encoder``;
    the text is encoded with ``text_to_sequence`` (default cleaners). Both are
    moved to the GPU and passed to ``model.inference``.

    Returns:
        ``(mel_outputs, mel_outputs_postnet, alignments)`` from the model.
    """
    waveform, sample_rate = load_wav_to_torch(audiopath)
    processed_wav = encoder.preprocess_wav(waveform.numpy(), sample_rate)
    utterance_embed = torch.Tensor(encoder.embed_utterance(processed_wav)).cuda()

    # Symbol ids with a leading batch axis.
    symbol_ids = np.array(text_to_sequence(text))[None, :]
    sequence = torch.autograd.Variable(torch.from_numpy(symbol_ids))
    sequence = sequence.cuda().long()

    outputs = model.inference(sequence, utterance_embed)
    mel_outputs, mel_outputs_postnet, _, alignments = outputs
    return mel_outputs, mel_outputs_postnet, alignments
Exemple #8
0
def prepare_input_sequence(texts, cpu_run=False):
    """Encode and pad a batch of texts for inference.

    Args:
        texts: iterable of strings to encode with ``text_to_sequence``.
        cpu_run: when true the tensors are kept on the CPU even if CUDA is
            available.

    Returns:
        ``(text_padded, input_lengths)`` from ``pad_sequences``, both cast to
        long and moved to the GPU when CUDA is available and ``cpu_run`` is
        false.
    """
    # TODO: eng or kor (alternative cleaner list: ['transliteration_cleaners'])
    encoded = [
        torch.IntTensor(text_to_sequence(text, ['english_cleaners']))
        for text in texts
    ]
    text_padded, input_lengths = pad_sequences(encoded)

    # Plain tensors are used directly: the torch.autograd.Variable wrapper is
    # deprecated and simply returns the tensor since PyTorch 0.4.
    if torch.cuda.is_available() and not cpu_run:
        text_padded = text_padded.cuda()
        input_lengths = input_lengths.cuda()

    return text_padded.long(), input_lengths.long()
Exemple #9
0
def inference_mel(text, model):
    """
    Performs conversion from text to mel spectrogram.

    The post-net mel output is saved with ``torch.save`` to the file
    ``text_to_mel`` in the current working directory, and that file's name is
    written to ``text_to_mel.txt``.

    args:
        text: The text to convert
        model: model exposing ``inference(sequence)``; the input is placed on
            the GPU before the call

    returns:
        The name of the text file that records where the tensor was saved
        (``"text_to_mel.txt"``).
    """
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    # Inference only: no autograd graph is needed for the saved tensor.
    with torch.no_grad():
        _, mel_outputs_postnet, _, _ = model.inference(sequence)

    # Collapse the leading axes into one; for the 80-channel, batch-of-one
    # output this is exactly reshape(80, n_frames), without hard-coding 80.
    mel = mel_outputs_postnet.reshape(-1, mel_outputs_postnet.shape[2]).detach()

    filename = "text_to_mel"
    torch.save(mel, filename)

    pointer_path = filename + ".txt"
    with open(pointer_path, 'w') as pointer_file:
        pointer_file.write(filename)

    return pointer_path
Exemple #10
0
 def get_text(self, text):
     """Encode ``text`` with ``self.text_cleaners`` and return it as an IntTensor."""
     return torch.IntTensor(text_to_sequence(text, self.text_cleaners))
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Every line of ``args.input`` (or a default sentence when the file cannot
    be read) is processed in turn. With ``args.tacotron2`` set, the text is
    run through Tacotron 2 to obtain a mel spectrogram; otherwise the
    spectrogram loaded once from ``args.mel_file`` is reused. WaveGlow then
    produces the audio, which is written to ``<args.output>audio_<i>.wav``.

    raises:
        ValueError: if neither ``args.tacotron2`` nor ``args.mel_file`` is set.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    # tacotron2 model filepath was specified
    if args.tacotron2:
        # Setup Tacotron2
        tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.fp16_run)
    # file with mel spectrogram was specified
    elif args.mel_file:
        mel = torch.load(args.mel_file)
        # Add a leading batch axis for WaveGlow.
        mel = torch.unsqueeze(mel.cuda(), 0)
    else:
        # Fail early with a clear message instead of hitting an undefined
        # `mel` once the vocoder is invoked.
        raise ValueError(
            "Either a Tacotron2 checkpoint or a mel spectrogram file must be provided.")

    # Setup WaveGlow
    if args.old_waveglow:
        waveglow = torch.load(args.waveglow)['model']
        waveglow = waveglow.remove_weightnorm(waveglow)
        waveglow = waveglow.cuda()
        waveglow.eval()
    else:
        waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.fp16_run)

    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, TypeError, ValueError):
        # Missing/unreadable path, no path given, or undecodable content:
        # fall back to the built-in sentence.
        print("Could not read file. Using default text.")
        texts = ["The forms of printed letters should be beautiful, and\
        that their arrangement on the page should be reasonable and\
        a help to the shapeliness of the letters themselves."]

    for i, text in enumerate(texts):

        LOGGER.iteration_start()

        sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).cuda().long()

        if args.tacotron2:
            tacotron2_t0 = time.time()
            with torch.no_grad():
                _, mel, _, _ = tacotron2.inference(sequence)
            tacotron2_t1 = time.time()
            # Throughput in input symbols per second.
            tacotron2_infer_perf = sequence.size(1)/(tacotron2_t1-tacotron2_t0)
            LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)

        waveglow_t0 = time.time()
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=args.sigma_infer)
            audio = audio.float()
        waveglow_t1 = time.time()
        # Throughput in audio samples per second.
        waveglow_infer_perf = audio[0].size(0)/(waveglow_t1-waveglow_t0)

        # args.output is used as a plain prefix, so it needs its own trailing
        # separator to name a directory.
        audio_path = args.output + "audio_"+str(i)+".wav"
        write(audio_path, args.sampling_rate, audio[0].data.cpu().numpy())

        LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
        LOGGER.iteration_stop()

    LOGGER.finish()
Exemple #12
0
 def get_sequence(self, text, speaker_id):
     """Return the ``text_to_sequence`` encoding of ``text`` for ``speaker_id`` using ``self.text_cleaners``."""
     cleaners = self.text_cleaners
     return text_to_sequence(text, speaker_id, cleaners)
Exemple #13
0
 def get_text(self, text):
     """Map the input text to an integer tensor via ``text_to_sequence``."""
     symbol_ids = text_to_sequence(text, self.text_cleaners)
     return torch.IntTensor(symbol_ids)
Exemple #14
0
 def get_text(self, text):
     """Return the ``text_to_sequence`` encoding of ``text`` using ``self.text_cleaners``."""
     encoded = text_to_sequence(text, self.text_cleaners)
     return encoded
Exemple #15
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    For each anchor directory in ``args.training_anchor_dirs`` (its index is
    used as the speaker id) every ``(path, text)`` metadata entry is loaded,
    its mel spectrogram is computed with ``TacotronSTFT``, padded and
    subsampled by ``args.n_frames_per_step``, and passed to ``model.infer``
    together with the encoded text. The resulting mel output is saved under
    ``args.output_dir`` using the input file's base name.
    """
    arg_parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    arg_parser = parse_training_args(arg_parser)
    args, _ = arg_parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    stdout_backend = dllg.StdOutBackend(log_file=None,
                                        logging_scope=dllg.TRAIN_ITER_SCOPE,
                                        iteration_interval=1)
    json_backend = dllg.JsonBackend(log_file=args.log_file,
                                    logging_scope=dllg.TRAIN_ITER_SCOPE,
                                    iteration_interval=1)
    LOGGER.set_backends([stdout_backend, json_backend])
    for metric_name in ("tacotron2_frames_per_sec", "tacotron2_latency", "latency"):
        LOGGER.register_metric(metric_name, metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(arg_parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    speaker_dirs = [
        os.path.join(args.dataset_path, anchor)
        for anchor in args.training_anchor_dirs
    ]
    metadatas = [load_metadata(speaker_dir) for speaker_dir in speaker_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
                        args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        # The position of an anchor directory doubles as its speaker id.
        for speaker_id, metadata in enumerate(metadatas):
            for wav_path, text in tqdm(metadata):
                symbol_ids = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(symbol_ids)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])

                wav = load_wav_to_torch(wav_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0)).squeeze()
                n_frames = mel.size(1)

                # Round (n_frames - 1) up to the next multiple of the step and
                # pad the frame axis to that length.
                step = args.n_frames_per_step
                max_target_len = n_frames - 1
                max_target_len += step - max_target_len % step
                pad_widths = [(0, 0), (0, max_target_len - n_frames)]
                padded_mel = np.pad(mel,
                                    pad_widths,
                                    mode='constant',
                                    constant_values=args.mel_pad_val)

                # Keep every `step`-th frame as the target.
                target = padded_mel[:, ::step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])

                outputs = model.infer(
                    to_gpu(seqs).long(),
                    to_gpu(seq_lens).int(),
                    to_gpu(targets).half(),
                    to_gpu(target_lengths).int())
                # Exactly four non-None outputs are expected; the second one
                # is the mel spectrogram.
                cpu_outputs = [out.cpu() for out in outputs if out is not None]
                _, mel_out, _, _ = cpu_outputs
                mel_out = mel_out.squeeze()[:, :n_frames - 1]
                assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length)

                out_path = os.path.join(args.output_dir,
                                        os.path.basename(wav_path))
                np.save(out_path, mel_out, allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    total_time = measurements['tacotron2_time']
    LOGGER.log(key="tacotron2_latency", value=total_time)
    LOGGER.log(key="latency", value=total_time)
    LOGGER.iteration_stop()
    LOGGER.finish()
Exemple #16
0
                       origin='bottom',
                       interpolation='none')
        fig.savefig('data' + str(i) + '.png')
    plt.close(fig)


# Make the local WaveGlow checkout importable.
sys.path.append('./waveglow')

tacotron_path = 'output/checkpoint_Tacotron2_30'
# Deserialize the checkpoint once (onto the CPU) and reuse it for both the
# model config and the weights; load_state_dict copies the values into the
# model's own parameters, so the weights do not need a second load.
taco_checkpoint = torch.load(tacotron_path, map_location='cpu')
state_dict = taco_checkpoint['state_dict']
t2 = models.get_model('Tacotron2', taco_checkpoint['config'], to_cuda=True)

text = "아들 진수가 살아 돌아온다"
# preprocessing: encode the text and add a leading batch axis
inputs = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
print(inputs)
inputs = torch.from_numpy(inputs).to(device='cuda', dtype=torch.int64)
#inputs = torch.from_numpy(np.array([bombom, kitkat], dtype=np.int64)).to(device='cuda', dtype=torch.int64)

#input_lengths = torch.IntTensor([inputs.size(1), inputs.size(1)]).cuda().long()
input_lengths = torch.IntTensor([inputs.size(1)]).cuda().long()
speaker_id = torch.IntTensor([0]).cuda().long()
# NOTE(review): this embedding is computed before load_state_dict below, so
# it reflects the model's weights as constructed -- confirm that is intended.
embedded_speaker = t2.speakers_embedding(speaker_id)
print("speaker", embedded_speaker)

t2.load_state_dict(state_dict)
_ = t2.cuda().eval().half()

waveglow = torch.load('output/waveglow_128000')['model']
for m in waveglow.modules():
Exemple #17
0
 def get_text(self, text):
     """Encode ``text`` with ``text_to_sequence`` (default arguments) and return it as an IntTensor."""
     return torch.IntTensor(text_to_sequence(text))