Esempio n. 1
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    if args.include_warmup:
        sequences = torch.randint(low=0, high=148, size=(1,50), dtype=torch.long).cuda()
        text_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                _, mels, _, _, mel_lengths = model.infer(sequences, text_lengths)

    os.makedirs(args.output, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.anchor_dirs]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for mel_path, text in tqdm(metadata):
               seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
               seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
               seq_lens = torch.IntTensor([len(text)])
               melspec = torch.from_numpy(np.load(mel_path))
               target = melspec[:, ::args.reduction_factor]
               targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
               target_lengths = torch.IntTensor([target.shape[1]])
               inputs = (to_gpu(seqs).long(), to_gpu(seq_lens).int(), to_gpu(targets).float(), to_gpu(target_lengths).int())
               _, mel_outs, _, _ = model(inputs)
               fname = os.path.basename(mel_path)
               np.save(os.path.join(args.output, fname), mel_outs[0, :, :melspec.shape[1]], allow_pickle=False)

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
Esempio n. 2
0
 def __init__(self, args, anchor_dirs):
     self.speaker_num = len(anchor_dirs)
     self.meta_dirs = [
         os.path.join(args.dataset_path, anchor_dirs[i])
         for i in range(self.speaker_num)
     ]
     self.metadatas = [
         load_metadata(meta_dir) for meta_dir in self.meta_dirs
     ]
     self.offsets = [0] * self.speaker_num
     self.text_cleaners = args.text_cleaners
     self.sampling_rate = args.sampling_rate
     self.load_mel_from_disk = args.load_mel_from_disk
     self.stft = TacotronSTFT(args.filter_length, args.hop_length,
                              args.win_length, args.n_mel_channels,
                              args.sampling_rate, args.mel_fmin,
                              args.mel_fmax)
     random.seed(1234)
     for i in range(self.speaker_num):
         random.shuffle(self.metadatas[i])
Esempio n. 3
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [
        os.path.join(args.dataset_path, anchor)
        for anchor in args.training_anchor_dirs
    ]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
                        args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for npy_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                wav = load_wav_to_torch(npy_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0))
                mel = mel.squeeze()
                max_target_len = mel.size(1) - 1
                max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step
                padded_mel = np.pad(mel, [(0, 0),
                                          (0, max_target_len - mel.size(1))],
                                    mode='constant',
                                    constant_values=args.mel_pad_val)
                target = padded_mel[:, ::args.n_frames_per_step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                outputs = model.infer(
                    to_gpu(seqs).long(),
                    to_gpu(seq_lens).int(),
                    to_gpu(targets).half(),
                    to_gpu(target_lengths).int())
                _, mel_out, _, _ = [
                    output.cpu() for output in outputs if output is not None
                ]
                mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1]
                assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length)
                fname = os.path.basename(npy_path)
                np.save(os.path.join(args.output_dir, fname),
                        mel_out,
                        allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()