Example #1
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap,
                save_path: Path):

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x,
                 save_path / '%sk_steps_%s_target.wav' % (repr1(k), repr1(i)))

        batch_str = 'gen_batched_target%s_overlap%s' % (
            repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / '%sk_steps_%s_%s.wav' %
                       (repr1(k), repr1(i), repr1(batch_str)))

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
Example #2
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
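
Usage note: Example #24 below calls gen_testset at every checkpoint. A minimal sketch of that invocation, assuming hparams were loaded via hp.configure and a Paths object exists as in the other examples:

# Render a few validation clips at a checkpoint (values taken from hparams)
gen_testset(model, test_set,
            samples=hp.voc_gen_at_checkpoint,  # number of clips to render
            batched=hp.voc_gen_batched,        # batched (folded) generation
            target=hp.voc_target,              # samples per fold
            overlap=hp.voc_overlap,            # crossfaded overlap between folds
            save_path=paths.voc_output)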
Example #3
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path/f'__{file_name}__{k}k_steps_target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")


    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = save_path/f'__{file_name}__{k}k_steps_{batch_str}.wav'

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
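
A minimal sketch of driving gen_from_file with a precomputed mel file; the paths are hypothetical and the hp values mirror the other examples:

from pathlib import Path

gen_from_file(model,
              load_path=Path('mels/sentence_001.npy'),  # hypothetical input
              save_path=Path('model_outputs'),          # hypothetical output dir
              batched=hp.voc_gen_batched,
              target=hp.voc_target,
              overlap=hp.voc_overlap)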
Example #4
def synthesize(text):
    input = text + "|00-" + lang + "|" + lang

    # Change to Multi_TTS path
    sys.path.append(
        os.path.join(os.path.dirname(__file__),
                     "dependencies/Multilingual_Text_to_Speech"))

    if "utils" in sys.modules: del sys.modules["utils"]

    from synthesize import synthesize
    from utils import build_model

    # Load Multilingual pretrained model
    model = build_model(
        os.path.abspath("./dependencies/checkpoints/generated_switching.pyt"))
    model.eval()

    # generate spectrogram
    spectrogram = synthesize(model, "|" + input)

    # Change to WaveRNN Path
    sys.path.append(
        os.path.join(os.path.dirname(__file__), "dependencies/WaveRNN"))

    if "utils" in sys.modules: del sys.modules["utils"]

    from models.fatchord_version import WaveRNN
    from utils import hparams as hp
    from gen_wavernn import generate
    import torch

    # Load WaveRNN pretrained model
    hp.configure("hparams.py")
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode).to(
            torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    model.load(
        os.path.join(os.path.dirname(__file__),
                     "dependencies/checkpoints/wavernn_weight.pyt"))

    waveform = generate(model, spectrogram, hp.voc_gen_batched, hp.voc_target,
                        hp.voc_overlap)

    # Write the generated waveform to disk; assumes scipy.io.wavfile is the
    # intended writer (waveform is a numpy array, not raw bytes)
    from scipy.io.wavfile import write
    write("./temp/result.wav", hp.sample_rate, waveform)
Example #5
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):

    k = model.get_step() // 1000
    os.makedirs(save_path/'test', exist_ok=True)
    for file_name in tqdm(os.listdir(load_path)):
        if file_name.endswith('.npy'):
            mel = np.load(os.path.join(load_path, file_name))
            mel = torch.tensor(mel).unsqueeze(0)

            batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
            save_str = save_path/f'test/{file_name}__{k}k_steps_{batch_str}.wav'

            _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
Example #6
 def train(self,
           model: WaveRNN,
           optimizer: Optimizer,
           train_gta=False) -> None:
     voc_schedule = self.train_cfg['schedule']
     voc_schedule = parse_schedule(voc_schedule)
     for i, session_params in enumerate(voc_schedule, 1):
         lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set, val_set_samples = get_vocoder_datasets(
                 path=self.paths.data,
                 batch_size=bs,
                 train_gta=train_gta,
                 max_mel_len=self.train_cfg['max_mel_len'],
                 hop_length=self.dsp.hop_length,
                 voc_pad=model.pad,
                 voc_seq_len=self.train_cfg['seq_len'],
                 voc_mode=self.dsp.voc_mode,
                 bits=self.dsp.bits,
                 num_gen_samples=self.train_cfg['num_gen_samples'])
             session = VocSession(index=i,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set,
                                  val_set_samples=val_set_samples)
             self.train_session(model, optimizer, session, train_gta)
Example #7
 def evaluate(self, model: WaveRNN, val_set: Dataset) -> float:
     model.eval()
     val_loss = 0
     device = next(model.parameters()).device
     for i, (x, y, m) in enumerate(val_set, 1):
         x, m, y = x.to(device), m.to(device), y.to(device)
         with torch.no_grad():
             y_hat = model(x, m)
             if model.mode == 'RAW':
                 y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
             elif model.mode == 'MOL':
                 y = y.float()
             y = y.unsqueeze(-1)
             loss = self.loss_func(y_hat, y)
             val_loss += loss.item()
     return val_loss / len(val_set)
Example #8
def wave_rnn(pretrained=True, **kwargs):
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    mode=hp.voc_mode)
    if pretrained:
        state_dict = fetch_and_load_state_dict("wavernn")
        model.load_state_dict(state_dict)
    return model
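
A brief usage sketch, assuming generate() has the same positional signature as in the examples above; the output path and mel tensor are hypothetical:

model = wave_rnn(pretrained=True)
model.eval()
# mel: torch.FloatTensor shaped (1, n_mels, n_frames), e.g. np.load(...) + unsqueeze(0)
wav = model.generate(mel, 'out.wav', hp.voc_gen_batched, hp.voc_target,
                     hp.voc_overlap, hp.mu_law)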
Example #9
def load_wavernn(checkpoint_path: str) -> Tuple[WaveRNN, Dict[str, Any]]:
    print(f'Loading voc checkpoint {checkpoint_path}')
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    config = checkpoint['config']
    voc_model = WaveRNN.from_config(config)
    voc_model.load_state_dict(checkpoint['model'])
    print(f'Loaded model with step {voc_model.get_step()}')
    return voc_model, config
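
Restoring and sampling from such a checkpoint might look like this sketch, mirroring the keyword call in Example #11 (the checkpoint path is hypothetical):

voc_model, config = load_wavernn('checkpoints/latest_model.pt')
voc_model.eval()
with torch.no_grad():
    wav = voc_model.generate(mels=mel, batched=True, target=11000,
                             overlap=550, mu_law=True, silent=True)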
Example #10
 def evaluate(self, model: WaveRNN, val_set: Dataset) -> float:
     model.eval()
     val_loss = 0
     device = next(model.parameters()).device
     for i, batch in enumerate(val_set, 1):
         batch = to_device(batch, device=device)
         x, y, m = batch['x'], batch['y'], batch['mel']
         with torch.no_grad():
             y_hat = model(x, m)
             if model.mode == 'RAW':
                 y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
             elif model.mode == 'MOL':
                 y = y.float()
             y = y.unsqueeze(-1)
             loss = self.loss_func(y_hat, y)
             val_loss += loss.item()
     return val_loss / len(val_set)
Example #11
    def generate_samples(self, model: WaveRNN,
                         session: VocSession) -> Tuple[float, list]:
        """
        Generates audio samples to cherry-pick models. To evaluate audio quality
        we calculate the l1 distance between mels of predictions and targets.
        """
        model.eval()
        mel_losses = []
        gen_wavs = []
        device = next(model.parameters()).device
        for i, sample in enumerate(session.val_set_samples, 1):
            m, x = sample['mel'], sample['x']
            if i > self.train_cfg['num_gen_samples']:
                break
            x = x[0].numpy()
            bits = 16 if self.dsp.voc_mode == 'MOL' else self.dsp.bits
            if self.dsp.mu_law and self.dsp.voc_mode != 'MOL':
                x = DSP.decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = DSP.label_2_float(x, bits)
            gen_wav = model.generate(mels=m,
                                     batched=self.train_cfg['gen_batched'],
                                     target=self.train_cfg['target'],
                                     overlap=self.train_cfg['overlap'],
                                     mu_law=self.dsp.mu_law,
                                     silent=True)

            gen_wavs.append(gen_wav)
            y_mel = self.dsp.wav_to_mel(x.squeeze(), normalize=False)
            y_mel = torch.tensor(y_mel).to(device)
            y_hat_mel = self.dsp.wav_to_mel(gen_wav, normalize=False)
            y_hat_mel = torch.tensor(y_hat_mel).to(device)
            loss = F.l1_loss(y_hat_mel, y_mel)
            mel_losses.append(loss.item())

            self.writer.add_audio(tag=f'Validation_Samples/target_{i}',
                                  snd_tensor=x,
                                  global_step=model.step,
                                  sample_rate=self.dsp.sample_rate)
            self.writer.add_audio(tag=f'Validation_Samples/generated_{i}',
                                  snd_tensor=gen_wav,
                                  global_step=model.step,
                                  sample_rate=self.dsp.sample_rate)

        return sum(mel_losses) / len(mel_losses), gen_wavs[0]
Example #12
    def generate_samples(self, model: WaveRNN,
                         session: VocSession) -> Tuple[float, list]:
        """
        Generates audio samples to cherry-pick models. To evaluate audio quality
        we calculate the l1 distance between mels of predictions and targets.
        """
        model.eval()
        mel_losses = []
        gen_wavs = []
        device = next(model.parameters()).device
        for i, (m, x) in enumerate(session.val_set_samples, 1):
            if i > hp.voc_gen_num_samples:
                break
            x = x[0].numpy()
            bits = 16 if hp.voc_mode == 'MOL' else hp.bits
            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = label_2_float(x, bits)
            gen_wav = model.generate(mels=m,
                                     save_path=None,
                                     batched=hp.voc_gen_batched,
                                     target=hp.voc_target,
                                     overlap=hp.voc_overlap,
                                     mu_law=hp.mu_law,
                                     silent=True)

            gen_wavs.append(gen_wav)
            y_mel = raw_melspec(x.squeeze())
            y_mel = torch.tensor(y_mel).to(device)
            y_hat_mel = raw_melspec(gen_wav)
            y_hat_mel = torch.tensor(y_hat_mel).to(device)
            loss = F.l1_loss(y_hat_mel, y_mel)
            mel_losses.append(loss.item())

            self.writer.add_audio(tag=f'Validation_Samples/target_{i}',
                                  snd_tensor=x,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)
            self.writer.add_audio(tag=f'Validation_Samples/generated_{i}',
                                  snd_tensor=gen_wav,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)

        return sum(mel_losses) / len(mel_losses), gen_wavs[0]
Example #13
def get_wavernn_model(model_path):
    device = torch.device('cuda')
    print()
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    mode=hp.voc_mode).to(device)

    model.load(model_path)
    return model
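
Note that get_wavernn_model pins the model to torch.device('cuda'), so it requires a GPU. A hypothetical call:

wavernn = get_wavernn_model('checkpoints/wave_step500K.pyt')  # hypothetical path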
Example #14
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):

    k = model.get_step() // 1000

    mypqmf = PQMF()
    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        if hp.voc_multiband:

            x = x[0].numpy()

            bits = 16 if hp.voc_mode == 'MOL' else hp.bits

            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2 ** bits, from_labels=True)
            else:
                x = label_2_float(x, bits)

            source = mypqmf.synthesis(
                torch.tensor(x, dtype=torch.float).unsqueeze(
                    0)).numpy()  # (1, sub_band, T//sub_band) -> (1, 1, T)
            source = source.squeeze() # (T,)
            save_wav(source, save_path / f'{k}k_steps_{i}_target.wav')
            # np.save(save_path/f'{k}k_steps_{i}_target.npy', x, allow_pickle=False)

        else:
            x = x[0].numpy()

            bits = 16 if hp.voc_mode == 'MOL' else hp.bits

            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = label_2_float(x, bits)

            save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{i}_{batch_str}.wav')  # output path for the post-PQMF waveform
        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
Example #15
 def train(self, model: WaveRNN, optimizer: Optimizer, train_gta=False) -> None:
     for i, session_params in enumerate(hp.voc_schedule, 1):
         lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set, val_set_samples = get_vocoder_datasets(
                 path=self.paths.data, batch_size=bs, train_gta=train_gta)
             session = VocSession(
                 index=i, lr=lr, max_step=max_step,
                 bs=bs, train_set=train_set, val_set=val_set,
                 val_set_samples=val_set_samples)
             self.train_session(model, optimizer, session, train_gta)
Example #16
 def __init__(self, tts_path: str, voc_path: str, device='cuda'):
     self.device = torch.device(device)
     tts_checkpoint = torch.load(tts_path, map_location=self.device)
     tts_config = tts_checkpoint['config']
     tts_model = ForwardTacotron.from_config(tts_config)
     tts_model.load_state_dict(tts_checkpoint['model'])
     self.tts_model = tts_model
     self.wavernn = WaveRNN.from_checkpoint(voc_path)
     self.melgan = torch.hub.load('seungwonpark/melgan', 'melgan')
     self.melgan.to(device).eval()
     self.cleaner = Cleaner.from_config(tts_config)
     self.tokenizer = Tokenizer()
     self.dsp = DSP.from_config(tts_config)
Example #17
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path / f'{prefix}{file_name}.target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )
    else:
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")

    m = torch.tensor(mel).unsqueeze(0)

    save_str_wavernn = save_path / f'{prefix}{file_name}.wavernn.wav'
    save_str_griffinlim = save_path / f'{prefix}{file_name}.griffinlim.wav'

    wav = reconstruct_waveform(mel, n_iter=32)
    save_wav(wav, save_str_griffinlim)

    _ = model.generate(m, save_str_wavernn, batched, target, overlap,
                       hp.mu_law)
Example #18
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(
            wav, save_path / '__%s__%sk_steps_target.wav' %
            (repr1(file_name), repr1(k)))
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                'Expected a numpy array shaped (n_mels, n_hops), but got %s!' %
                (repr1(mel.shape)))
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                'Expected spectrogram range in [0,1] but was instead [%s, %s]'
                % (repr1(_min), repr1(_max)))
    else:
        raise ValueError('Expected an extension of .wav or .npy, but got %s!' %
                         (repr1(suffix)))

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = 'gen_batched_target%s_overlap%s' % (
        repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
    save_str = save_path / '__%s__%sk_steps_%s.wav' % (
        repr1(file_name), repr1(k), repr1(batch_str))

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
Example #19
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap,
                save_path: Path):
    '''
    :param model:
    :param test_set: test set containing mel (or sp+f0) features and the loaded source audio files
    :param samples: number of samples to generate, i.e. how many audio clips
    :param batched: batched is True in this script
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: the generated audio files
    '''

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path / f'{k}k_steps_{i}_target.wav')  # save the original target audio

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
Example #20
def gen_testset(model: WaveRNN, test_set_wav, samples, batched, target,
                overlap, save_path: Path):
    '''
    :param model:
    :param test_set_wav: directory of test-set feature files (mel or sp+f0) extracted from the source audio
    :param samples: number of samples to generate, i.e. how many audio clips
    :param batched: batched is True in this script
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: the generated audio files
    '''

    for i in os.listdir(test_set_wav):
        m = np.expand_dims(np.load(join(test_set_wav, i)).T, 0)
        filename = basename(i)[:-4]

        wave_path = "/emotion_wav/"
        save_str = wave_path + str(filename) + ".wav"

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
Example #21
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):
    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(
            wav, os.path.join(save_path, "target",
                              os.path.basename(load_path)))
        print("Generating from {0}".format(load_path))
        mel = melspectrogram(wav)
        print("Melspectrograms generated!")
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )
    else:
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = os.path.join(save_path, os.path.basename(load_path))

    beg = time.time()
    print("Start generating... [{0}]".format(beg))
    output = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
    end = time.time()
    print("Done generating... [{0}] -> delta: [{1}]".format(end, end - beg))
    save_wav(output, save_str)
Example #22

print("Cur Dir", os.getcwd())

if "utils" in sys.modules:
    del sys.modules["utils"]

sys.path.append(WAVERNN_FOLDER)

from gen_wavernn import generate
from utils import hparams as hp
from models.fatchord_version import WaveRNN

hp.configure(WAVERNN_FOLDER+'/hparams.py')
model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode).to('cpu')
model.load(CHECKPOINTS_FOLDER + "/" + wavernn_chpt)

y = []

ix = 1
while os.path.exists(CHR_FOLDER + "/" + str(ix) + ".npy"):
    print("Found", CHR_FOLDER + "/" + str(ix) + ".npy")
    y.append(np.load(CHR_FOLDER + "/" + str(ix) + ".npy"))
    ix += 1

idx = 1
for s in y:
    waveform = generate(model, s, hp.voc_gen_batched,
                        hp.voc_target, hp.voc_overlap)
Example #23
def main():

    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    parser.add_argument('--lr',
                        '-l',
                        type=float,
                        help='[float] override hparams.py learning rate')
    parser.add_argument('--batch_size',
                        '-b',
                        type=int,
                        help='[int] override hparams.py batch size')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--gta',
                        '-g',
                        action='store_true',
                        help='train wavernn on GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # load hparams from file
    if args.lr is None:
        args.lr = hp.voc_lr
    if args.batch_size is None:
        args.batch_size = hp.voc_batch_size

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    batch_size = args.batch_size
    force_train = args.force_train
    train_gta = args.gta
    lr = args.lr

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError(
                '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc',
                       paths,
                       voc_model,
                       optimizer,
                       create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([
        ('Remaining', str(
            (total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size), ('LR', lr),
        ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta)
    ])

    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set, test_set,
                   lr, total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
Example #24
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
Example #25
    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising WaveRNN Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode='MOL').to(device)

    voc_model.load('quick_start/voc_weights/latest_weights.pyt')

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
Example #26
    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(
        embed_dims=hp.forward_embed_dims,
        num_chars=len(phonemes),
        durpred_rnn_dims=hp.forward_durpred_rnn_dims,
        durpred_conv_dims=hp.forward_durpred_conv_dims,
Example #27
    gta = args.gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    mode=hp.voc_mode).to(device)

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    voc_weights = args.voc_weights if args.voc_weights else paths.voc_latest_weights

    model.load(voc_weights)

    simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])
Example #28
    def train_session(self, model: WaveRNN, optimizer: Optimizer,
                      session: VocSession, train_gta: bool) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([(f'Steps ', str(training_steps // 1000) + 'k'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Sequence Length', self.train_cfg['seq_len']),
                      ('GTA Training', train_gta)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters

        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                batch = to_device(batch, device=device)
                x, y = batch['x'], batch['y']
                y_hat = model(x, batch['mel'])
                if model.mode == 'RAW':
                    y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
                elif model.mode == 'MOL':
                    y = batch['y'].float()
                y = y.unsqueeze(-1)

                loss = self.loss_func(y_hat, y)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['gen_samples_every'] == 0:
                    stream(msg + 'generating samples...')
                    gen_result = self.generate_samples(model, session)
                    if gen_result is not None:
                        mel_loss, gen_wav = gen_result
                        self.writer.add_scalar('Loss/generated_mel_l1',
                                               mel_loss, model.get_step())
                        self.track_top_models(mel_loss, gen_wav, model)

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.voc_checkpoints /
                                    f'wavernn_step{k}k.pt')

                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.voc_checkpoints /
                            'latest_model.pt')

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
Example #29
    parser.add_argument('--gta',
                        '-g',
                        action='store_true',
                        help='train wavernn on GTA features')
    parser.add_argument('--config',
                        metavar='FILE',
                        default='config.yaml',
                        help='The config containing all hyperparams.')
    args = parser.parse_args()

    config = read_config(args.config)
    paths = Paths(config['data_path'], config['voc_model_id'],
                  config['tts_model_id'])
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    print('Using device:', device)
    print('\nInitialising Model...\n')
    voc_model = WaveRNN.from_config(config).to(device)
    dsp = DSP.from_config(config)
    assert np.cumprod(
        config['vocoder']['model']['upsample_factors'])[-1] == dsp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint(model=voc_model,
                       optim=optimizer,
                       path=paths.voc_checkpoints / 'latest_model.pt',
                       device=device)

    voc_trainer = VocTrainer(paths=paths, dsp=dsp, config=config)
    voc_trainer.train(voc_model, optimizer, train_gta=args.gta)
Example #30
def thak():
    class Tshamsoo():
        force_cpu = os.getenv('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = os.getenv('BATCHED', True)
        target = os.getenv('TARGET', None)
        overlap = os.getenv('OVERLAP', None)
        tts_weights = None
        save_attn = os.getenv('SAVE_ATTN', False)
        voc_weights = None
        iters = os.getenv('GL_ITERS', 32)

    args = Tshamsoo()
    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        # argparse.ArgumentError requires an argument object; a plain ValueError works here
        raise ValueError('Must provide a valid vocoder type!')

    hp.configure(args.hp_file)  # Load hparams from file

    tts_weights = args.tts_weights
    save_attn = args.save_attn

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap
        if args.batched is None:
            args.batched = hp.voc_gen_batched

        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)

        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)
    else:
        voc_model = None
        batched = None
        target = None
        overlap = None

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)
    return args, voc_model, tts_model, batched, target, overlap, save_attn