Ejemplo n.º 1
0
def acoustic_infer(data, config):
    """Generate acoustic features (f0 and RMSE) for `data` with a trained
    Acoustic model loaded from `config.acoustic_path`.

    Returns:
        (f0, rmse): two lists holding one denoised tensor per batch.
    """
    dataloader = dataprocess.load_infer(data, model_type='acoustic')
    model = Acoustic(config)
    model.load_state_dict(
        torch.load(config.acoustic_path, map_location='cpu')['state_dict'])
    model = set_device(model, config.device)
    model.eval()

    print(
        colored('Generating acoustics with ', 'blue', attrs=['bold']) +
        config.acoustic_path)

    # Normalized f0 floor: anything below it is treated as unvoiced and zeroed.
    f0_min = dsp.midi2hz(config.min_note + 3)
    f0_min = dsp.f0_normalize(f0_min, config.min_note,
                              config.min_note + config.num_note)

    f0 = []
    rmse = []
    for batch in tqdm(dataloader, leave=False, ascii=True):
        # y_prev comes from the dataloader each iteration (teacher forcing).
        # The original code also seeded/fed back generated outputs into
        # y_prev, but those stores were overwritten here before any use, so
        # the dead assignments were removed.
        x, y_prev, _ = set_device(batch, config.device)

        f0_gen, rmse_gen = model(x, y_prev)

        f0_denoised = f0_gen.squeeze(0).data
        f0_denoised[f0_denoised < f0_min] = 0
        f0.append(f0_denoised)

        rmse_denoised = rmse_gen.squeeze(0).data
        rmse_denoised[rmse_denoised < 0] = 0
        rmse.append(rmse_denoised)

    return f0, rmse
Ejemplo n.º 2
0
def stft(y, config):
    """Return the linear magnitude spectrogram of waveform `y`.

    Computes a power spectrogram with torchaudio's Spectrogram transform and
    takes its square root to obtain magnitudes.
    """
    transform = Spectrogram(n_fft=config.fft_size,
                            win_length=config.win_size,
                            hop_length=config.hop_size)
    y, transform = set_device((y, transform), config.device)
    power_spec = transform(y)

    return torch.sqrt(power_spec)
Ejemplo n.º 3
0
def main():
    """End-to-end inference: preprocess audio, generate a mel-spectrogram
    with Tacotron, vocode it, and save the resulting waveform."""
    config = ConfigXT()
    load = FileXT(config.audio_path)

    print(
        colored('Preprocessing audio for ', 'blue', attrs=['bold']) +
        load.basename)
    data = preprocess.preprocess(load.filename,
                                 config.speaker,
                                 config,
                                 verbose=False)
    dataloader = dataprocess.load_infer(data)

    model = Tacotron(config)
    model.load_state_dict(
        torch.load(config.model_path, map_location='cpu')['state_dict'])
    model = set_device(model, config.device)
    model.eval()

    print(
        colored('Generating mel-spectrogram with ', 'blue', attrs=['bold']) +
        config.model_path)
    mel = []
    # Seed the "previous frame" with silence; each iteration it is replaced
    # by the dataloader's frame, with the last generated frame kept as the
    # carry-over between batches.
    y_prev = set_device(torch.zeros(1, config.mel_size, 1), config.device)
    for batch in tqdm(dataloader, leave=False, ascii=True):
        x, y_prev, _ = set_device(batch, config.device)

        y_gen, _ = model(x, y_prev)
        mel.append(y_gen.data)
        y_prev = y_gen[..., -1].unsqueeze(-1)

    mel = torch.cat(mel, dim=-1)
    if config.vocoder == 'wavernn':
        wave = wavernn_infer(mel, config)
    elif config.vocoder == 'waveglow':
        wave = waveglow_infer(mel, config)
    else:
        # Fail fast: previously an unknown vocoder left `wave` undefined and
        # crashed with a NameError at torchaudio.save below.
        raise ValueError('Unsupported vocoder: %s' % config.vocoder)

    savename = config.model_path.replace('.pt', '_') + FileXT(
        config.vocoder_path).basestem + '_speaker' + str(
            config.speaker) + '_' + load.basename
    torchaudio.save(savename, wave, config.sample_rate)

    print(colored('Audio generated to ', 'blue', attrs=['bold']) + savename)
Ejemplo n.º 4
0
def istft(magnitude, phase, config):
    """Reconstruct a time-domain waveform from magnitude and phase
    spectrograms via the inverse STFT."""
    # Rebuild the complex STFT as stacked (real, imag) components.
    real = magnitude * torch.cos(phase)
    imag = magnitude * torch.sin(phase)
    stft_matrix = torch.stack((real, imag), dim=-1)

    window = torch.hann_window(config.win_size)
    stft_matrix, window = set_device((stft_matrix, window), config.device)

    return torchaudio.functional.istft(stft_matrix,
                                       n_fft=config.fft_size,
                                       hop_length=config.hop_size,
                                       win_length=config.win_size,
                                       window=window)
Ejemplo n.º 5
0
def wavernn_infer(mel, config):
    """Vocode mel-spectrogram `mel` to a waveform with a trained WaveRNN.

    Returns:
        The generated waveform tensor, moved to the CPU.
    """
    print(
        colored('Running WaveRNN with ', 'blue', attrs=['bold']) +
        config.vocoder_path)
    wavernn = WaveRNN(config)
    wavernn.load_state_dict(torch.load(config.vocoder_path,
                                       map_location='cpu'))
    wavernn = set_device(wavernn, config.device)
    # Consistency fix: like waveglow_infer/melgan_infer, run in eval mode and
    # without autograd bookkeeping during generation.
    wavernn.eval()

    with torch.no_grad():
        wave = wavernn.infer(mel)

    return wave.cpu()
Ejemplo n.º 6
0
def durian_infer(data, config):
    """Generate a mel-spectrogram for `data` with a trained DurIAN model."""
    dataloader = dataprocess.load_infer(data, model_type='durian')
    model = DurIAN(config)
    state = torch.load(config.durian_path, map_location='cpu')['state_dict']
    model.load_state_dict(state)
    model = set_device(model, config.device)
    model.eval()

    print(
        colored('Generating mel-spectrogram with ', 'blue', attrs=['bold']) +
        config.durian_path)

    # Autoregressive generation: seed with a silent frame, then feed the last
    # generated frame back in as the next "previous" frame.
    y_prev = set_device(torch.zeros(1, config.mel_size, 1), config.device)
    chunks = []
    for batch in tqdm(dataloader, leave=False, ascii=True):
        x, _, _ = set_device(batch, config.device)

        y_gen, _ = model(x, y_prev)
        chunks.append(y_gen.data)
        y_prev = y_gen[..., -1].unsqueeze(-1)

    return torch.cat(chunks, dim=-1)
Ejemplo n.º 7
0
def waveglow_infer(mel, config):
    """Vocode mel-spectrogram `mel` with a trained WaveGlow model and its
    bias denoiser.

    Returns:
        The peak-normalized waveform tensor, moved to the CPU.
    """
    print(
        colored('Running WaveGlow with ', 'blue', attrs=['bold']) +
        config.vocoder_path)

    waveglow = WaveGlow(config)
    waveglow, _, _ = load_checkpoint(config.vocoder_path, waveglow)

    # Weight norm is a training-time reparameterization; fold it away for
    # faster inference. (Removed a stale commented-out torch.hub load here.)
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow = set_device(waveglow, config.device)
    waveglow.eval()

    denoiser = Denoiser(waveglow, config)
    denoiser = set_device(denoiser, config.device)

    with torch.no_grad():
        wave = waveglow.infer(mel, config.sigma).float()
        wave = denoiser(wave, strength=config.denoising_strength)

    # Peak-normalize to [-1, 1].
    wave = wave / torch.max(torch.abs(wave))

    return wave.cpu()
Ejemplo n.º 8
0
def melgan_infer(mel, config):
    """Vocode mel-spectrogram `mel` with the pretrained MelGAN published on
    torch.hub, returning the waveform scaled to [-1, 1] on the CPU."""
    print(
        colored('Running MelGAN with ', 'blue', attrs=['bold']) +
        config.vocoder_path)
    vocoder = torch.hub.load('seungwonpark/melgan', 'melgan')
    vocoder = set_device(vocoder, config.device)
    vocoder.eval()

    with torch.no_grad():
        wave = vocoder.inference(mel)

    # Divide by the 16-bit full-scale value (2**15) to normalize amplitude.
    max_wav_value = 2**(16 - 1)
    wave = wave.float() / max_wav_value

    return wave.cpu()
Ejemplo n.º 9
0
def magphase(y, config):
    """Compute the magnitude and phase spectrograms of waveform `y`.

    Returns:
        (magnitude, phase) tensors derived from the STFT of `y`.
    """
    window = torch.hann_window(config.win_size)
    y, window = set_device((y, window), config.device)
    # Bug fix: win_length must match the Hann window length (config.win_size).
    # The original passed config.fft_size, which torch.stft rejects whenever
    # win_size != fft_size; the sibling stft()/istft() both use win_size.
    stft_matrix = torch.stft(y,
                             n_fft=config.fft_size,
                             hop_length=config.hop_size,
                             win_length=config.win_size,
                             window=window)

    real = stft_matrix[..., 0]
    imag = stft_matrix[..., 1]

    magnitude = torch.sqrt(real**2 + imag**2)
    phase = torch.atan2(imag.data, real.data)

    return magnitude, phase
Ejemplo n.º 10
0
def melspectrogram(y, config, squeeze=True):
    """Compute a mel-spectrogram of waveform `y`.

    Normalization depends on config.norm_type: 'db' rescales decibel values,
    'log' takes a floored natural log, and any other value leaves linear
    mels. When `squeeze` is True the leading dimension is dropped.
    """
    magnitude = stft(y, config)

    # Project the linear-frequency spectrogram onto the mel scale.
    basis = filters.mel(sr=config.sample_rate,
                        n_fft=config.fft_size,
                        n_mels=config.mel_size,
                        fmin=config.mel_fmin,
                        fmax=config.mel_fmax)
    basis = set_device(torch.from_numpy(basis), config.device)
    mel = torch.matmul(basis, magnitude)

    if config.norm_type == 'db':
        mel = normalize(amp2db(mel), config.min_level_db)
    elif config.norm_type == 'log':
        floor = db2amp(config.min_level_db)
        mel = torch.log(torch.clamp(mel, min=floor))

    return mel.squeeze(0) if squeeze else mel
Ejemplo n.º 11
0
def main():
    """Load an audio file, compute its mel-spectrogram, and resynthesize it
    with the vocoder selected by config.vocoder."""
    config = ConfigXT()
    load = FileXT(config.audio_path)

    print(
        colored('Preprocessing audio for ', 'blue', attrs=['bold']) +
        load.basename)
    y = dsp.load(load.filename, config.sample_rate)
    mel = dsp.melspectrogram(y, config, squeeze=False)
    mel = set_device(mel, config.device)

    if config.vocoder == 'wavernn':
        wave = wavernn_infer(mel, config)
    elif config.vocoder == 'waveglow':
        wave = waveglow_infer(mel, config)
    elif config.vocoder == 'melgan':
        wave = melgan_infer(mel, config)
    else:
        # Fail fast: previously an unknown vocoder left `wave` undefined and
        # crashed with a NameError at torchaudio.save below.
        raise ValueError('Unsupported vocoder: %s' % config.vocoder)

    savename = config.vocoder_path.replace('.pt', '_') + load.basename
    torchaudio.save(savename, wave, config.sample_rate)

    print(colored('Audio generated to ', 'blue', attrs=['bold']) + savename)
Ejemplo n.º 12
0
def main():
    """Train DurIAN: run train/validation loops per epoch, log L1 losses,
    and periodically save checkpoints (skipped when config.test_run)."""
    config = ConfigXT()
    config_basename = FileXT(config.file).basename
    print("Configuration file: %s" % (config_basename))

    checkpoint_path = config.checkpoint_path
    if not config.test_run:
        # Persist the config next to the checkpoints and open a TensorBoard log.
        checkpoint_path = create_path(config.checkpoint_path)
        config.save(os.path.join(checkpoint_path, config_basename))
        writer = SummaryWriter(checkpoint_path)

    dataloader = dataprocess.load_train(config, model_type='durian')
    model = DurIAN(config)
    model = set_device(model, config.device)
    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config.learn_rate,
                                 weight_decay=config.weight_decay)
    # Scheduler steps per batch, so scale step_size by batches per epoch.
    scheduler = StepLR(optimizer,
                       step_size=len(dataloader.train) * config.step_size,
                       gamma=config.factor)

    losses = []
    loss_train = LossLog()
    loss_valid = LossLog()
    for epoch in range(config.stop_epoch):
        # Train Loop
        model.train()
        for batch in tqdm(dataloader.train, leave=False, ascii=True):
            x, y_prev, y = set_device(batch, config.device)

            optimizer.zero_grad()
            y_gen, y_decoder_gen = model(x, y_prev)
            # L1 on both the final output and the intermediate decoder output.
            loss = criterion(y_gen, y) + criterion(y_decoder_gen, y)
            loss.backward()
            if config.clip_grad_norm is not None:
                # Returned norm was unused; just clip in place.
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.clip_grad_norm)
            optimizer.step()
            scheduler.step()

            loss_train.add(loss.item(), y[0].size(0))
            if not config.test_run:
                writer.add_scalar('train/l1_loss', loss.item(),
                                  loss_train.iteration)

        # Validation Loop
        model.eval()
        # no_grad: validation previously built autograd graphs for nothing,
        # wasting memory; loss values are unchanged.
        with torch.no_grad():
            for batch in tqdm(dataloader.valid, leave=False, ascii=True):
                x, y_prev, y = set_device(batch, config.device)

                y_gen, y_decoder_gen = model(x, y_prev)
                loss = criterion(y_gen, y) + criterion(y_decoder_gen, y)

                loss_valid.add(loss.item(), y[0].size(0))
                if not config.test_run:
                    writer.add_scalar('valid/l1_loss', loss.item(),
                                      loss_valid.iteration)

        learn_rate = scheduler.get_lr()[0]
        print(
            "[Epoch %d/%d] [loss train: %.5f] [loss valid: %.5f] [lr: %.5f]" %
            (epoch, config.stop_epoch, loss_train.avg(), loss_valid.avg(),
             learn_rate))

        losses.append([loss_train.avg(), loss_valid.avg()])
        loss_train.reset()
        loss_valid.reset()

        if not config.test_run:
            loss_savename = os.path.join(checkpoint_path, 'loss.pt')
            torch.save(losses, loss_savename)

            # Always refresh the rolling latest checkpoint...
            savename = os.path.join(checkpoint_path, 'latest_checkpoint.pt')
            save_checkpoint(savename, model, optimizer, learn_rate,
                            loss_train.iteration)

            # ...and keep a numbered snapshot every save_epoch epochs.
            if epoch % config.save_epoch == 0:
                savename = os.path.join(checkpoint_path,
                                        'epoch' + str(epoch) + '.pt')
                save_checkpoint(savename, model, optimizer, learn_rate,
                                loss_train.iteration)