Ejemplo n.º 1
0
def test(sigma, batch_size, seed, checkpoint_path):
    """Evaluate a WaveGlow checkpoint on the configured test files.

    Seeds the CPU and CUDA RNGs, builds a model from ``waveglow_config``,
    restores it through ``load_checkpoint`` and prints the loss averaged over
    every full batch built from ``data_config['testing_files']``.

    Args:
        sigma: Value forwarded to ``WaveGlowLoss``.
        batch_size: Samples per batch; an incomplete last batch is dropped.
        seed: Seed passed to ``torch.manual_seed`` and
            ``torch.cuda.manual_seed``.
        checkpoint_path: Checkpoint location handed to ``load_checkpoint``.

    Returns:
        The mean per-batch loss as a float.

    Raises:
        ValueError: If the loader yields no batch at all (for example when
            the test set holds fewer than ``batch_size`` items).
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # Restore the weights, then switch to eval mode on whatever object
    # load_checkpoint hands back.
    model, iteration = load_checkpoint(checkpoint_path, model)
    model.eval()

    testset = Mel2Samp(data_config['testing_files'],
                       data_config['segment_length'],
                       data_config['filter_length'], data_config['hop_length'],
                       data_config['win_length'], data_config['sampling_rate'],
                       data_config['mel_fmin'], data_config['mel_fmax'])
    test_loader = DataLoader(testset,
                             num_workers=1,
                             shuffle=False,
                             sampler=None,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for mel, audio in test_loader:
            outputs = model((mel.cuda(), audio.cuda()))
            total_loss += criterion(outputs).item()
            num_batches += 1

    # Count batches explicitly: relying on the loop index left it undefined
    # when the loader was empty.
    if num_batches == 0:
        raise ValueError(
            "test loader produced no batches; check the test file list "
            "and batch_size")

    val_loss = total_loss / num_batches
    print("test loss: {}:\t{:.9f}".format(iteration, val_loss))
    return val_loss
Ejemplo n.º 2
0
def waveglow_infer(mel, config):
    """Vocode a mel spectrogram with WaveGlow and return peak-normalised audio.

    Args:
        mel: Mel spectrogram passed unchanged to ``waveglow.infer``.
        config: Object providing ``vocoder_path``, ``device``, ``sigma`` and
            ``denoising_strength``; it is also handed to the ``WaveGlow`` and
            ``Denoiser`` constructors.

    Returns:
        The denoised waveform on the CPU, scaled so that its largest absolute
        sample is 1. An all-zero waveform is returned unscaled.
    """
    print(
        colored('Running WaveGlow with ', 'blue', attrs=['bold']) +
        config.vocoder_path)

    waveglow = WaveGlow(config)
    waveglow, _, _ = load_checkpoint(config.vocoder_path, waveglow)

    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow = set_device(waveglow, config.device)
    waveglow.eval()

    denoiser = Denoiser(waveglow, config)
    denoiser = set_device(denoiser, config.device)

    with torch.no_grad():
        wave = waveglow.infer(mel, config.sigma).float()
        wave = denoiser(wave, strength=config.denoising_strength)

    # Skip the normalisation for silent output: dividing by a zero peak
    # would fill the waveform with NaNs.
    peak = torch.max(torch.abs(wave))
    if peak > 0:
        wave = wave / peak

    return wave.cpu()
Ejemplo n.º 3
0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(hp.seed)
    torch.cuda.manual_seed(hp.seed)
    model = WaveGlow().cuda()
    checkpoint = torch.load('test/TTSglow_130000')
    model.load_state_dict(checkpoint['model'].state_dict())

    dataset = FastSpeechDataset()
    testing_loader = DataLoader(dataset,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=collate_fn,
                                drop_last=True,
                                num_workers=4)
    model = model.eval()

    for i, data_of_batch in enumerate(testing_loader):
        src_seq = data_of_batch["texts"]
        src_pos = data_of_batch["pos"]

        src_seq = torch.from_numpy(src_seq).long().to(device)
        src_pos = torch.from_numpy(src_pos).long().to(device)

        mel = model.inference(src_seq, src_pos, sigma=1.0, alpha=1.0)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join("results", "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        plot_data([mel.cpu().numpy().T], i)
        if i > 10:
Ejemplo n.º 4
0
def train(num_gpus,
          rank,
          group_name,
          output_directory,
          epochs,
          learning_rate,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          fp16_run,
          checkpoint_path,
          with_tensorboard,
          num_workers=2):
    """Train WaveGlow and run an evaluation pass after every epoch.

    Args:
        num_gpus: Number of participating GPUs; values above 1 enable the
            distributed setup (``init_distributed``, gradient all-reduce and
            ``DistributedSampler``).
        rank: Rank of this process; only rank 0 creates the output directory,
            writes checkpoints and logs to tensorboard.
        group_name: Forwarded to ``init_distributed``.
        output_directory: Directory receiving checkpoints and ``logs/``.
        epochs: Upper bound (exclusive) of the epoch counter.
        learning_rate: Learning rate of the Adam optimizer.
        sigma: Value forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: A checkpoint is saved whenever the global
            iteration is a multiple of this value.
        batch_size: Samples per batch for both loaders.
        seed: Seed for the CPU and CUDA RNGs.
        fp16_run: When true, apex ``amp`` (opt level O1) is used.
        checkpoint_path: Checkpoint to resume from; ``""`` starts fresh.
        with_tensorboard: Enables tensorboardX logging on rank 0.
        num_workers: Worker processes per ``DataLoader``.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # HACK: setup separate training and eval sets
    # NOTE(review): this mutates the module-level data_config in place, so a
    # second call to train() in the same process fails with KeyError.
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======

    print("Creating dataloaders with " + str(num_workers) + " workers")
    # DataLoader rejects shuffle=True together with an explicit sampler, so
    # shuffling is only requested when no sampler is in use.
    train_loader = DataLoader(trainset,
                              num_workers=num_workers,
                              shuffle=train_sampler is None,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset,
                             num_workers=num_workers,
                             shuffle=eval_sampler is None,
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    # The loggers only exist when this flag is true; every logger access
    # below is guarded by it.
    log_to_tensorboard = with_tensorboard and rank == 0
    if log_to_tensorboard:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))

    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()

                mel, audio = batch
                mel = mel.cuda()
                audio = audio.cuda()
                outputs = model((mel, audio))

                loss = criterion(outputs)
                if num_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()

                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                optimizer.step()

                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))
                if log_to_tensorboard and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used,
                                            iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization,
                                            iteration)
                    logger_train.flush()

                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)

                iteration += 1
                train_pbar.update(1)

        # Eval
        model.eval()
        torch.cuda.empty_cache()

        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    mel, audio = batch
                    mel = mel.cuda()
                    audio = audio.cuda()
                    outputs = model((mel, audio))
                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(
                        epoch, loss))
                    outputs = None

                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)

            # A non-empty loss_accum also guarantees that the first eval
            # batch was captured for the audio samples below.
            if log_to_tensorboard and loss_accum:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)

                # log audio samples to tensorboard
                tensorboard_audio_generated = model.infer(tensorboard_mel)
                # Never index past the batch: it may hold fewer than 5 items.
                num_samples = min(5, len(tensorboard_audio),
                                  len(tensorboard_audio_generated))
                for sample_idx in range(num_samples):
                    ta = tensorboard_audio[sample_idx].cpu().numpy()
                    tag = tensorboard_audio_generated[sample_idx].cpu().numpy()
                    logger_eval.add_audio(
                        "sample " + str(sample_idx) + "/orig",
                        ta,
                        epoch,
                        sample_rate=data_config['sampling_rate'])
                    logger_eval.add_audio(
                        "sample " + str(sample_idx) + "/gen",
                        tag,
                        epoch,
                        sample_rate=data_config['sampling_rate'])
                logger_eval.flush()
Ejemplo n.º 5
0
class TTSModel(object):
    """Text-to-speech pipeline built from Tacotron2 and a vocoder.

    Tacotron2 turns text into a mel spectrogram. The waveform is produced
    either by WaveGlow followed by a denoiser (when a WaveGlow checkpoint is
    supplied) or by Griffin-Lim. ``synth_speech`` is the cached public entry
    point and is bound to one of the two private synthesis methods in
    ``__init__``.
    """

    def __init__(self, tacotron2_path, waveglow_path, **kwargs):
        """Load the models and wire up the cached ``synth_speech`` callable.

        Args:
            tacotron2_path: Checkpoint whose ``"state_dict"`` entry is loaded
                into Tacotron2.
            waveglow_path: WaveGlow checkpoint; a falsy value selects the
                Griffin-Lim path and skips loading WaveGlow altogether.
            **kwargs: Forwarded to ``HParams``.
        """
        super(TTSModel, self).__init__()
        hparams = HParams(**kwargs)
        self.hparams = hparams

        use_cuda = torch.cuda.is_available()
        # Without CUDA every tensor in the checkpoint is remapped to the CPU.
        map_location = None if use_cuda else "cpu"

        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        self.model = Tacotron2(hparams)
        self.model.load_state_dict(
            torch.load(tacotron2_path,
                       map_location=map_location)["state_dict"])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

        self.k_cache = klepto.archives.file_archive(cached=False)
        if waveglow_path:
            wave_params = torch.load(waveglow_path, map_location=map_location)
            try:
                # First treat the checkpoint as a plain state dict ...
                self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
                self.waveglow.load_state_dict(wave_params)
            except Exception:
                # ... otherwise fall back to a checkpoint that stores the
                # whole module under "model". Catching Exception rather than
                # using a bare except keeps KeyboardInterrupt and SystemExit
                # propagating.
                self.waveglow = wave_params["model"]
                self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
            if use_cuda:
                self.waveglow.cuda()
            self.waveglow.eval()
            # workaround from
            # https://github.com/NVIDIA/waveglow/issues/127
            for m in self.waveglow.modules():
                if "Conv" in str(type(m)):
                    setattr(m, "padding_mode", "zeros")
            # NOTE(review): this leaves the invertible convolutions in half
            # precision; confirm that is intended, in particular on CPU.
            for k in self.waveglow.convinv:
                k.float().half()
            self.denoiser = Denoiser(self.waveglow,
                                     n_mel_channels=hparams.n_mel_channels)
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech)
        else:
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech_fast)
        self.taco_stft = TacotronSTFT(
            hparams.filter_length,
            hparams.hop_length,
            hparams.win_length,
            n_mel_channels=hparams.n_mel_channels,
            sampling_rate=hparams.sampling_rate,
            mel_fmax=4000,
        )

    def _generate_mel_postnet(self, text):
        """Return Tacotron2's post-net mel output for ``text``."""
        sequence = np.array(text_to_sequence(text,
                                             ["english_cleaners"]))[None, :]
        sequence = torch.from_numpy(sequence).long()
        if torch.cuda.is_available():
            sequence = sequence.cuda()
        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = self.model.inference(sequence)
        return mel_outputs_postnet

    def synth_speech_array(self, text, vocoder):
        """Synthesise ``text`` and return the waveform as a NumPy array.

        Args:
            text: Input text.
            vocoder: ``VOCODER_WAVEGLOW`` or ``VOCODER_GL``.

        Raises:
            ValueError: If ``vocoder`` is neither of the two constants.
        """
        mel_outputs_postnet = self._generate_mel_postnet(text)

        if vocoder == VOCODER_WAVEGLOW:
            with torch.no_grad():
                audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
                audio_t = self.denoiser(audio_t, 0.1)[0]
            audio = audio_t[0].data
        elif vocoder == VOCODER_GL:
            mel_decompress = self.taco_stft.spectral_de_normalize(
                mel_outputs_postnet)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0],
                                     self.taco_stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            if torch.cuda.is_available():
                spec_from_mel = spec_from_mel.cuda()
            audio = griffin_lim(
                spec_from_mel[:, :, :-1],
                self.taco_stft.stft_fn,
                GL_ITERS,
            )
            audio = audio.squeeze()
        else:
            raise ValueError("vocoder arg should be one of [wavglow|gl]")
        audio = audio.cpu().numpy()
        return audio

    def _synth_speech(self,
                      text,
                      speed: float = 1.0,
                      sample_rate: int = OUTPUT_SAMPLE_RATE):
        """Synthesise ``text`` with WaveGlow and post-process the audio."""
        audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)

        return postprocess_audio(
            audio,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
            tempo=speed,
        )

    def _synth_speech_fast(self,
                           text,
                           speed: float = 1.0,
                           sample_rate: int = OUTPUT_SAMPLE_RATE):
        """Synthesise ``text`` with Griffin-Lim and post-process the audio."""
        audio = self.synth_speech_array(text, VOCODER_GL)

        return postprocess_audio(
            audio,
            tempo=speed,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
        )
Ejemplo n.º 6
0
def train(
    num_gpus,
    rank,
    group_name,
    output_directory,
    epochs,
    learning_rate,
    sigma,
    iters_per_checkpoint,
    batch_size,
    seed,
    fp16_run,
    checkpoint_path,
    with_tensorboard,
):
    """Train WaveGlow, periodically logging real and generated samples.

    Args:
        num_gpus: Number of participating GPUs; values above 1 enable the
            distributed setup (``init_distributed``, gradient all-reduce and
            ``DistributedSampler``).
        rank: Rank of this process; only rank 0 creates the output directory,
            writes checkpoints and logs to tensorboard.
        group_name: Forwarded to ``init_distributed``.
        output_directory: Directory receiving checkpoints and ``logs``.
        epochs: Upper bound (exclusive) of the epoch counter.
        learning_rate: Learning rate of the Adam optimizer.
        sigma: Value forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: A checkpoint is saved whenever the global
            iteration is a multiple of this value.
        batch_size: Samples per training batch.
        seed: Seed for the CPU and CUDA RNGs.
        fp16_run: When true, apex ``amp`` (opt level O1) is used.
        checkpoint_path: Checkpoint to resume from; ``""`` starts fresh.
        with_tensorboard: Enables tensorboardX logging on rank 0.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=False,
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=True,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    log_to_tensorboard = with_tensorboard and rank == 0
    if log_to_tensorboard:
        from tensorboardX import SummaryWriter

        logger = SummaryWriter(os.path.join(output_directory, "logs"))

    # Log audio at the configured rate; 22050 is kept as the fallback for
    # configs that do not carry a "sampling_rate" entry.
    audio_sample_rate = data_config.get("sampling_rate", 22050)

    # fixed for visualization: up to eight samples, fewer when the dataset
    # is smaller than that.
    num_visualized = min(8, len(trainset))
    real_mels, real_audios = zip(
        *[trainset[idx] for idx in range(num_visualized)])
    real_mel = torch.cat(real_mels, dim=-1)
    real_audio = torch.cat(real_audios, dim=0)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = mel.cuda()
            audio = audio.cuda()
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if log_to_tensorboard:
                step = i + len(train_loader) * epoch
                logger.add_scalar("training_loss", reduced_loss, step)
                if step % 500 == 0:
                    # Generate audio for the fixed samples in eval mode,
                    # then return to training mode.
                    model.eval()
                    with torch.no_grad():
                        device = mel.device
                        fake_audio = (model.infer(
                            torch.stack(real_mels).to(device)).flatten(
                                0, 1).cpu())
                    model.train()
                    fake_mel = trainset.get_mel(fake_audio)

                    logger.add_image(
                        "training_mel_real",
                        plot_spectrogram_to_numpy(real_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_real",
                        real_audio,
                        step,
                        audio_sample_rate,
                    )
                    logger.add_image(
                        "training_mel_fake",
                        plot_spectrogram_to_numpy(fake_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_fake",
                        fake_audio,
                        step,
                        audio_sample_rate,
                    )
                    logger.flush()

            if iteration % iters_per_checkpoint == 0:
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1