def acoustic_infer(data, config):
    """Generate f0 and RMSE sequences for *data* with the acoustic model.

    Args:
        data: preprocessed inference data accepted by ``dataprocess.load_infer``.
        config: settings object providing ``acoustic_path``, ``device``,
            ``min_note`` and ``num_note``.

    Returns:
        ``(f0, rmse)``: two lists of per-batch tensors. f0 values below the
        normalized pitch floor and negative RMSE values are zeroed.
    """
    dataloader = dataprocess.load_infer(data, model_type='acoustic')
    model = Acoustic(config)
    checkpoint = torch.load(config.acoustic_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    model = set_device(model, config.device)
    model.eval()
    print(colored('Generating acoustics with ', 'blue', attrs=['bold'])
          + config.acoustic_path)

    # Normalized pitch floor: anything below min_note + 3 semitones is
    # treated as unvoiced and zeroed out.
    f0_floor = dsp.midi2hz(config.min_note + 3)
    f0_floor = dsp.f0_normalize(f0_floor, config.min_note,
                                config.min_note + config.num_note)

    f0, rmse = [], []
    # NOTE(review): y_prev is re-assigned from each batch inside the loop,
    # so this initial value and the per-iteration tuple update look unused —
    # confirm whether the batch is meant to supply the previous output.
    y_prev = set_device((torch.zeros(1, 1), torch.zeros(1, 1)), config.device)
    for batch in tqdm(dataloader, leave=False, ascii=True):
        x, y_prev, _ = set_device(batch, config.device)
        f0_gen, rmse_gen = model(x, y_prev)
        y_prev = (f0_gen, rmse_gen)

        f0_out = f0_gen.squeeze(0).data
        f0_out[f0_out < f0_floor] = 0
        f0.append(f0_out)

        rmse_out = rmse_gen.squeeze(0).data
        rmse_out[rmse_out < 0] = 0
        rmse.append(rmse_out)

    return f0, rmse
def stft(y, config):
    """Return the magnitude spectrogram of waveform *y*.

    Uses torchaudio's power Spectrogram and takes the square root to get
    magnitude. FFT/window/hop sizes come from *config*.
    """
    transform = Spectrogram(n_fft=config.fft_size,
                            win_length=config.win_size,
                            hop_length=config.hop_size)
    y, transform = set_device((y, transform), config.device)
    return torch.sqrt(transform(y))
def main():
    """Synthesize audio: preprocess input, run Tacotron, then a vocoder."""
    config = ConfigXT()
    load = FileXT(config.audio_path)
    print(colored('Preprocessing audio for ', 'blue', attrs=['bold'])
          + load.basename)

    data = preprocess.preprocess(load.filename, config.speaker, config,
                                 verbose=False)
    dataloader = dataprocess.load_infer(data)

    model = Tacotron(config)
    state = torch.load(config.model_path, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    model = set_device(model, config.device)
    model.eval()
    print(colored('Generating mel-spectrogram with ', 'blue', attrs=['bold'])
          + config.model_path)

    chunks = []
    y_prev = set_device(torch.zeros(1, config.mel_size, 1), config.device)
    for batch in tqdm(dataloader, leave=False, ascii=True):
        x, y_prev, _ = set_device(batch, config.device)
        y_gen, _ = model(x, y_prev)
        chunks.append(y_gen.data)
        # Last generated frame seeds the next batch (autoregressive hand-off).
        y_prev = y_gen[..., -1].unsqueeze(-1)
    mel = torch.cat(chunks, dim=-1)

    if config.vocoder == 'wavernn':
        wave = wavernn_infer(mel, config)
    elif config.vocoder == 'waveglow':
        wave = waveglow_infer(mel, config)

    savename = (config.model_path.replace('.pt', '_')
                + FileXT(config.vocoder_path).basestem
                + '_speaker' + str(config.speaker) + '_' + load.basename)
    torchaudio.save(savename, wave, config.sample_rate)
    print(colored('Audio generated to ', 'blue', attrs=['bold']) + savename)
def istft(magnitude, phase, config):
    """Reconstruct a waveform from STFT magnitude and phase.

    Builds the real/imag stacked representation expected by the legacy
    ``torchaudio.functional.istft`` API and inverts it with a Hann window.
    """
    window = torch.hann_window(config.win_size)
    real = magnitude * torch.cos(phase)
    imag = magnitude * torch.sin(phase)
    stft_matrix = torch.stack((real, imag), dim=-1)
    stft_matrix, window = set_device((stft_matrix, window), config.device)
    return torchaudio.functional.istft(stft_matrix,
                                       n_fft=config.fft_size,
                                       hop_length=config.hop_size,
                                       win_length=config.win_size,
                                       window=window)
def wavernn_infer(mel, config):
    """Vocode mel-spectrogram *mel* with a pretrained WaveRNN.

    Returns the generated waveform as a CPU tensor.
    """
    print(colored('Running WaveRNN with ', 'blue', attrs=['bold'])
          + config.vocoder_path)
    model = WaveRNN(config)
    model.load_state_dict(torch.load(config.vocoder_path, map_location='cpu'))
    model = set_device(model, config.device)
    return model.infer(mel).cpu()
def durian_infer(data, config):
    """Generate a mel-spectrogram from *data* with a trained DurIAN model.

    Returns the concatenation of all generated chunks along the time axis.
    """
    dataloader = dataprocess.load_infer(data, model_type='durian')
    model = DurIAN(config)
    state = torch.load(config.durian_path, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    model = set_device(model, config.device)
    model.eval()
    print(colored('Generating mel-spectrogram with ', 'blue', attrs=['bold'])
          + config.durian_path)

    chunks = []
    y_prev = set_device(torch.zeros(1, config.mel_size, 1), config.device)
    for batch in tqdm(dataloader, leave=False, ascii=True):
        x, _, _ = set_device(batch, config.device)
        y_gen, _ = model(x, y_prev)
        chunks.append(y_gen.data)
        # Carry the last generated frame into the next batch.
        y_prev = y_gen[..., -1].unsqueeze(-1)

    return torch.cat(chunks, dim=-1)
def waveglow_infer(mel, config):
    """Vocode mel-spectrogram *mel* with a pretrained WaveGlow.

    Applies the companion Denoiser after synthesis and peak-normalizes the
    result. Returns the waveform as a CPU tensor.
    """
    print(colored('Running WaveGlow with ', 'blue', attrs=['bold'])
          + config.vocoder_path)
    model = WaveGlow(config)
    model, _, _ = load_checkpoint(config.vocoder_path, model)
    model = model.remove_weightnorm(model)
    model = set_device(model, config.device)
    model.eval()

    denoiser = set_device(Denoiser(model, config), config.device)

    with torch.no_grad():
        wave = model.infer(mel, config.sigma).float()
        wave = denoiser(wave, strength=config.denoising_strength)
        # Peak-normalize to [-1, 1].
        wave = wave / torch.max(torch.abs(wave))
    return wave.cpu()
def melgan_infer(mel, config):
    """Vocode mel-spectrogram *mel* with the pretrained MelGAN from torch.hub.

    The hub model emits 16-bit-scaled samples; scale down to [-1, 1] floats
    and return a CPU tensor.
    """
    print(colored('Running MelGAN with ', 'blue', attrs=['bold'])
          + config.vocoder_path)
    model = torch.hub.load('seungwonpark/melgan', 'melgan')
    model = set_device(model, config.device)
    model.eval()

    with torch.no_grad():
        wave = model.inference(mel)

    full_scale = 2**(16 - 1)  # 16-bit PCM full-scale value
    wave = wave.float() / full_scale
    return wave.cpu()
def magphase(y, config):
    """Split waveform *y* into STFT magnitude and phase tensors.

    Args:
        y: input waveform tensor (shape as accepted by ``torch.stft``).
        config: settings providing ``fft_size``, ``hop_size``, ``win_size``
            and ``device``.

    Returns:
        ``(magnitude, phase)`` tensors of identical shape; phase is computed
        via ``atan2(imag, real)``.
    """
    window = torch.hann_window(config.win_size)
    y, window = set_device((y, window), config.device)
    # BUG FIX: win_length must equal the window tensor's length
    # (config.win_size). The previous call passed win_length=config.fft_size,
    # which conflicts with the hann_window(config.win_size) above and makes
    # torch.stft fail whenever win_size != fft_size.
    stft_matrix = torch.stft(y,
                             n_fft=config.fft_size,
                             hop_length=config.hop_size,
                             win_length=config.win_size,
                             window=window)
    real = stft_matrix[..., 0]
    imag = stft_matrix[..., 1]
    magnitude = torch.sqrt(real**2 + imag**2)
    phase = torch.atan2(imag.data, real.data)
    return magnitude, phase
def melspectrogram(y, config, squeeze=True):
    """Compute the (optionally normalized) mel-spectrogram of waveform *y*.

    Args:
        y: input waveform tensor.
        config: settings providing sample/FFT/mel parameters and
            ``norm_type`` ('db' or 'log').
        squeeze: drop the leading batch dimension when True.
    """
    spec = stft(y, config)
    mel_basis = filters.mel(sr=config.sample_rate,
                            n_fft=config.fft_size,
                            n_mels=config.mel_size,
                            fmin=config.mel_fmin,
                            fmax=config.mel_fmax)
    mel_basis = set_device(torch.from_numpy(mel_basis), config.device)
    mel = torch.matmul(mel_basis, spec)

    if config.norm_type == 'db':
        mel = normalize(amp2db(mel), config.min_level_db)
    elif config.norm_type == 'log':
        # Clamp to the amplitude floor before taking the log.
        floor = db2amp(config.min_level_db)
        mel = torch.log(torch.clamp(mel, min=floor))

    return mel.squeeze(0) if squeeze else mel
def main():
    """Load audio, compute its mel-spectrogram, and vocode it back to a wave."""
    config = ConfigXT()
    load = FileXT(config.audio_path)
    print(colored('Preprocessing audio for ', 'blue', attrs=['bold'])
          + load.basename)

    y = dsp.load(load.filename, config.sample_rate)
    mel = set_device(dsp.melspectrogram(y, config, squeeze=False),
                     config.device)

    if config.vocoder == 'wavernn':
        wave = wavernn_infer(mel, config)
    elif config.vocoder == 'waveglow':
        wave = waveglow_infer(mel, config)
    elif config.vocoder == 'melgan':
        wave = melgan_infer(mel, config)

    savename = config.vocoder_path.replace('.pt', '_') + load.basename
    torchaudio.save(savename, wave, config.sample_rate)
    print(colored('Audio generated to ', 'blue', attrs=['bold']) + savename)
def main():
    """Train a DurIAN model: per-epoch train/validation loops with logging.

    Reads all hyperparameters from ConfigXT. Unless ``config.test_run`` is
    set, saves the config, TensorBoard logs, the running loss history, a
    rolling latest checkpoint, and periodic per-epoch checkpoints under a
    freshly created checkpoint directory.
    """
    config = ConfigXT()
    config_basename = FileXT(config.file).basename
    print("Configuration file: %s" % (config_basename))

    checkpoint_path = config.checkpoint_path
    if not config.test_run:
        checkpoint_path = create_path(config.checkpoint_path)
        config.save(os.path.join(checkpoint_path, config_basename))
        writer = SummaryWriter(checkpoint_path)

    dataloader = dataprocess.load_train(config, model_type='durian')
    model = DurIAN(config)
    model = set_device(model, config.device)

    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learn_rate,
                                 weight_decay=config.weight_decay)
    # step_size is expressed in epochs; scheduler.step() runs per batch,
    # so scale by the number of training batches.
    scheduler = StepLR(optimizer,
                       step_size=len(dataloader.train) * config.step_size,
                       gamma=config.factor)

    losses = []
    loss_train = LossLog()
    loss_valid = LossLog()
    for epoch in range(config.stop_epoch):
        # --- Train loop ---
        model.train()
        for batch in tqdm(dataloader.train, leave=False, ascii=True):
            x, y_prev, y = set_device(batch, config.device)

            optimizer.zero_grad()
            y_gen, y_decoder_gen = model(x, y_prev)
            # L1 on both the post-net output and the raw decoder output.
            loss = criterion(y_gen, y) + criterion(y_decoder_gen, y)
            loss.backward()
            if config.clip_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.clip_grad_norm)
            optimizer.step()
            scheduler.step()

            loss_train.add(loss.item(), y[0].size(0))
            if not config.test_run:
                writer.add_scalar('train/l1_loss', loss.item(),
                                  loss_train.iteration)

        # --- Validation loop ---
        model.eval()
        # no_grad: validation needs no autograd graph (saves memory/time;
        # loss values are unchanged).
        with torch.no_grad():
            for batch in tqdm(dataloader.valid, leave=False, ascii=True):
                x, y_prev, y = set_device(batch, config.device)
                y_gen, y_decoder_gen = model(x, y_prev)
                loss = criterion(y_gen, y) + criterion(y_decoder_gen, y)

                loss_valid.add(loss.item(), y[0].size(0))
                if not config.test_run:
                    writer.add_scalar('valid/l1_loss', loss.item(),
                                      loss_valid.iteration)

        # get_last_lr() is the supported accessor outside scheduler.step();
        # get_lr() is internal and warns/misreports on recent PyTorch.
        learn_rate = scheduler.get_last_lr()[0]
        print(
            "[Epoch %d/%d] [loss train: %.5f] [loss valid: %.5f] [lr: %.5f]"
            % (epoch, config.stop_epoch, loss_train.avg(), loss_valid.avg(),
               learn_rate))

        losses.append([loss_train.avg(), loss_valid.avg()])
        loss_train.reset()
        loss_valid.reset()
        # NOTE(review): checkpoints below read loss_train.iteration after
        # reset(); assumes reset() keeps the global iteration counter —
        # confirm in LossLog.

        if not config.test_run:
            loss_savename = os.path.join(checkpoint_path, 'loss.pt')
            torch.save(losses, loss_savename)

            savename = os.path.join(checkpoint_path, 'latest_checkpoint.pt')
            save_checkpoint(savename, model, optimizer, learn_rate,
                            loss_train.iteration)
            if epoch % config.save_epoch == 0:
                savename = os.path.join(checkpoint_path,
                                        'epoch' + str(epoch) + '.pt')
                save_checkpoint(savename, model, optimizer, learn_rate,
                                loss_train.iteration)