Example #1
def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path,
          ground_truth: bool, save_every: int, backup_every: int,
          force_restart: bool):
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    mode=hp.voc_mode).cuda()

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr
    loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss

    # Load the weights
    model_dir = models_dir.joinpath(run_id)
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt")
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of WaveRNN from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("WaveRNN weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
        voc_dir.joinpath("synthesized.txt")
    mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath(
        "mels_gta")
    wav_dir = syn_dir.joinpath("audio")
    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)
    test_loader = DataLoader(dataset,
                             batch_size=1,
                             shuffle=True,
                             pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size), ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, 350):
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(data_loader, 1):
            x, m, y = x.cuda(), m.cuda(), y.cuda()

            # Forward pass
            y_hat = model(x, m)
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)
            print("y shape:", y.shape)
            print("y_hat shape:", y_hat.shape)
            # Backward pass
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(model_dir, optimizer)

            if save_every != 0 and step % save_every == 0:
                model.save(weights_fpath, optimizer)

            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                f"steps/s | Step: {k}k | "
            stream(msg)

        gen_testset(model, test_loader, hp.voc_gen_at_checkpoint,
                    hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                    model_dir)
        print("")
Example #2
def train(run_id='',
        syn_dir=None, voc_dirs=None, mel_dir_name='', models_dir=None, log_dir='',
        ground_truth=False,
        save_every=1000, backup_every=1000, log_every=1000,
        force_restart=False, total_epochs=10000, logger=None):
    voc_dirs = voc_dirs or []  # avoid a mutable default argument
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims, # 512
        fc_dims=hp.voc_fc_dims, # 512
        bits=hp.bits, # 9
        pad=hp.voc_pad, # 2
        upsample_factors=hp.voc_upsample_factors, # (3, 4, 5, 5) -> 300, (5,5,12)?
        feat_dims=hp.num_mels, # 80
        compute_dims=hp.voc_compute_dims, # 128
        res_out_dims=hp.voc_res_out_dims, # 128
        res_blocks=hp.voc_res_blocks, # 10
        hop_length=hp.hop_length, # 300
        sample_rate=hp.sample_rate, # 24000
        mode=hp.voc_mode # RAW (or MOL)
    ).cuda()

    # hp.apply_preemphasis in VocoderDataset
    # hp.mu_law in VocoderDataset
    # hp.voc_seq_len in VocoderDataset
    # hp.voc_lr in optimizer
    # hp.voc_batch_size for train

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr # 0.0001
    loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss

    # Load the weights
    model_dir = models_dir.joinpath(run_id) # gta_model/gtaxxxx
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt") # gta_model/gtaxxx/gtaxxx.pt
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of WaveRNN from scratch\n")
        model.save(str(weights_fpath), optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(str(weights_fpath), optimizer)
        print("WaveRNN weights loaded from step %d" % model.step)

    # Initialize the dataset
    #metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
    #    voc_dir.joinpath("synthesized.txt")
    #mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta")
    #wav_dir = syn_dir.joinpath("audio")
    #dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir)
    #dataset = VocoderDataset(str(voc_dir), 'mels-gta-1099579078086', 'audio')
    dataset = VocoderDataset([str(voc_dir) for voc_dir in voc_dirs], mel_dir_name, 'audio')
    #test_loader = DataLoader(dataset,
    #                         batch_size=1,
    #                         shuffle=True,
    #                         pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size),
                  ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, total_epochs):
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=30,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        # start from 1
        for i, (x, y, m) in enumerate(data_loader, 1):
            # cur [B, L], future [B, L] bit label, mels [B, D, T]
            x, m, y = x.cuda(), m.cuda(), y.cuda()

            # Forward pass
            # [B, L], [B, D, T] -> [B, L, C]
            y_hat = model(x, m)
            if model.mode == 'RAW':
                # [B, L, C] -> [B, C, L, 1]
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            # [B, L, 1]
            y = y.unsqueeze(-1)

            # Backward pass
            # [B, C, L, 1], [B, L, 1]
            # cross_entropy for RAW
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(str(model_dir), optimizer)

            if save_every != 0 and step % save_every == 0:
                model.save(str(weights_fpath), optimizer)

            if log_every != 0 and step % log_every == 0:
                logger.scalar_summary("loss", loss.item(), step)

            total_data = len(data_loader)
            msg = f"| Epoch: {epoch} ({i}/{total_data}) | " \
                  f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                  f"steps/s | Step: {k}k | "
            stream(msg)


        #gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
        #            hp.voc_target, hp.voc_overlap, model_dir)
        print("")
Example #3
def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path,
          ground_truth: bool, save_every: int, backup_every: int,
          force_restart: bool):
    # Check to make sure the hop length is correctly factorised
    # assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    # model = WaveRNN(
    #     rnn_dims=hp.voc_rnn_dims,
    #     fc_dims=hp.voc_fc_dims,
    #     bits=hp.bits,
    #     pad=hp.voc_pad,
    #     upsample_factors=hp.voc_upsample_factors,
    #     feat_dims=hp.num_mels,
    #     compute_dims=hp.voc_compute_dims,
    #     res_out_dims=hp.voc_res_out_dims,
    #     res_blocks=hp.voc_res_blocks,
    #     hop_length=hp.hop_length,
    #     sample_rate=hp.sample_rate,
    #     mode=hp.voc_mode
    # ).cuda()
    model = model_VC(32, 256, 512, 32).cuda()
    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr

    loss_recon = nn.MSELoss()
    loss_content = nn.L1Loss()
    # Load the weights
    model_dir = models_dir.joinpath(run_id)
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt")
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of AutoVC from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("AutoVC weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
        voc_dir.joinpath("synthesized.txt")
    mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath(
        "mels_gta")
    wav_dir = syn_dir.joinpath("audio")
    #2019.11.26
    embed_dir = syn_dir.joinpath("embeds")

    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir, embed_dir)
    test_loader = DataLoader(dataset,
                             batch_size=1,
                             shuffle=True,
                             pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size), ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, 350):

        model.train()
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        for i, (m, e, _) in enumerate(data_loader, 1):
            #print("e:",e.shape)
            #print("m:",m.shape)
            model.train()
            m, e = m.cuda(), e.cuda()
            # Forward pass
            C, X_C, X_before, X_after, _ = model(m, e, e)

            #c_org shape: torch.Size([100, 256, 1])
            #x shape: torch.Size([100, 80, 544])
            #c_org_expand shape torch.Size([100, 256, 544])
            #encoder_outputs shape: torch.Size([100, 544, 320])
            #C shape: torch.Size([100, 544, 64])
            #X shape: torch.Size([100, 1, 544, 80])
            X_after = X_after.squeeze(1).permute(0, 2, 1)
            X_before = X_before.squeeze(1).permute(0, 2, 1)

            #print("C shape:",C.shape)
            #if X_C:
            #    print("X_C shape:",X_C.shape)
            #print("X shape:",X.shape)
            # Backward pass
            loss_rec_before = loss_recon(X_before, m)
            loss_rec_after = loss_recon(X_after, m)
            loss_c = loss_content(C, X_C)
            loss = loss_rec_before + loss_rec_after + loss_c
            #print("recon loss:",loss1)
            #print("content loss:",loss2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print("loss:",loss.item())
            running_loss += loss.item()
            #print("running loss:",running_loss)
            speed = i / (time.time() - start)
            avg_loss = running_loss / i
            #print("avg_loss:",avg_loss)
            step = model.get_step()

            if hp.decay_learning_rate:
                for p in optimizer.param_groups:
                    p["lr"] = _learning_rate_decay(p["lr"], step)
            k = step // 1000
            if step % 100 == 0 and step != 0:
                model.eval()
                plt.figure(1)
                C, X_C, X_before, X_after, _ = model(m, e, e)
                X_after = X_after.squeeze(1).permute(0, 2, 1)
                mel_out = X_after.detach().cpu().numpy()

                from synthesizer import audio
                from synthesizer.hparams import hparams
                wav = audio.inv_mel_spectrogram(mel_out[0, :, :], hparams)
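                # note: librosa.output.write_wav was removed in librosa 0.8;
                # this call assumes an older librosa version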
                librosa.output.write_wav("out.wav", np.float32(wav),
                                         hparams.sample_rate)

                mel_out = mel_out[0, :, :].transpose(1, 0)
                plt.imshow(mel_out.T, interpolation='nearest', aspect='auto')
                plt.title("Generate Spectrogram")
                save_path = model_dir
                p_path = save_path.joinpath("generate.png")
                plt.savefig(p_path)

                plt.figure(2)
                m_out = m.detach().cpu().numpy()
                m_out = m_out[0, :, :].transpose(1, 0)
                plt.imshow(m_out.T, interpolation='nearest', aspect='auto')
                plt.title("Original Spectrogram")
                o_path = save_path.joinpath("original.png")
                plt.savefig(o_path)

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(model_dir, optimizer)

            if save_every != 0 and step % save_every == 0:
                model.save(weights_fpath, optimizer)
                torch.save(model, "model_ttsdb_48_48.pkl")

            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                f"steps/s | Step: {k}k | "
            stream(msg)

    # gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,hp.voc_target,model_dir)
        print("")
Example #4
    def synthesize_spectrograms(self,
                                texts: List[str],
                                embeddings: Union[np.ndarray,
                                                  List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) 
        :param return_alignments: if True, a matrix representing the alignments between the 
        characters
        and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the 
        sequence length of spectrogram i, and possibly the alignments.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

            # Print some info about the model when it is loaded
            tts_k = self._model.get_step() // 1000

            simple_table([("Tacotron", str(tts_k) + "k"),
                          ("r", self._model.r)])

        # Preprocess text inputs
        inputs = [
            text_to_sequence(text.strip(), hparams.tts_cleaner_names)
            for text in texts
        ]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [
            inputs[i:i + hparams.synthesis_batch_size]
            for i in range(0, len(inputs), hparams.synthesis_batch_size)
        ]
        batched_embeds = [
            embeddings[i:i + hparams.synthesis_batch_size]
            for i in range(0, len(embeddings), hparams.synthesis_batch_size)
        ]

        specs = []
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(
                self.device)

            # Inference
            _, mels, alignments = self._model.generate(chars,
                                                       speaker_embeddings)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram
                while np.max(m[:, -1]) < hparams.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs
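
The pad1d helper called in the batching loop above is not defined in this excerpt; a plausible minimal implementation, assuming right-padding with zeros (an assumption, not the repository's actual definition):

import numpy as np

def pad1d(x, max_len, pad_value=0):
    # right-pad a 1-D sequence of token ids to max_len
    return np.pad(x, (0, max_len - len(x)), mode="constant",
                  constant_values=pad_value)

# hypothetical usage mirroring the "Pad texts" step above
seqs = [[5, 3, 9], [7, 1]]
max_len = max(len(s) for s in seqs)
chars = np.stack([pad1d(s, max_len) for s in seqs])   # shape (2, 3)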
def train(run_id: str, models_dir: Path, metadata_path: Path,
          weights_path: Path, ground_truth: bool, save_every: int,
          backup_every: int, force_restart: bool):
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    mode=hp.voc_mode).cuda()

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr
    loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss

    # Load the weights
    model_dir = models_dir.joinpath(run_id)
    model_dir.mkdir(exist_ok=True)
    weights_fpath = weights_path
    metadata_fpath = metadata_path

    if force_restart:
        print("\nStarting the training of WaveRNN from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("WaveRNN weights loaded from step %d" % model.step)

    # Initialize the dataset

    dataset = VocoderDataset(metadata_fpath)

    test_loader = DataLoader(dataset,
                             batch_size=1,
                             shuffle=True,
                             pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size), ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    epoch_start = int(
        (model.step - 428000) * 110 / dataset.get_number_of_samples())
    epoch_end = 200

    log_path = os.path.join(models_dir, "logs")
    if not os.path.isdir(log_path):
        os.mkdir(log_path)

    writer = SummaryWriter(log_path)
    print("Log path : " + log_path)

    print("Starting from epoch: " + str(epoch_start))

    for epoch in range(epoch_start, epoch_start + epoch_end):
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(data_loader, 1):
            x, m, y = x.cuda(), m.cuda(), y.cuda()

            # Forward pass
            y_hat = model(x, m)
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)

            # Backward pass
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(model_dir, optimizer)

            # if save_every != 0 and step % save_every == 0 :
            #     model.save(weights_fpath, optimizer)

            if step % 500 == 0:
                writer.add_scalar('Loss/train', avg_loss,
                                  round(step / 1000, 1))
                msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                    f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                    f"steps/s | Step: {k}k | "
                print(msg, flush=True)

            if step % 15000 == 0:
                gen_testset(model, test_loader, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            model_dir)
                gen_meltest(model, hp.voc_gen_batched, hp.voc_target,
                            hp.voc_overlap, model_dir)
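
Example #4 streams the running average loss to TensorBoard every 500 steps. A standalone sketch of the same logging call, assuming SummaryWriter comes from torch.utils.tensorboard (the import is outside this excerpt); note that add_scalar's global_step argument is conventionally the raw integer step, whereas the example passes round(step / 1000, 1):

import os
from torch.utils.tensorboard import SummaryWriter

log_path = os.path.join("models", "logs")   # hypothetical log directory
os.makedirs(log_path, exist_ok=True)
writer = SummaryWriter(log_path)

for step in range(500, 2500, 500):
    avg_loss = 1.0 / step                   # dummy value standing in for the running average
    writer.add_scalar("Loss/train", avg_loss, step)
writer.close()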