def save_states(global_step, writer, y_hat, y, input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().item()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type) or is_linear_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        if is_mulaw_quantize(hparams.input_type):
            y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
            y = P.inv_mulaw_quantize(y, hparams.quantize_channels - 1)
        elif is_linear_quantize(hparams.input_type):
            y_hat = inv_linear_quantize(y_hat, hparams.quantize_channels - 1)
            y = inv_linear_quantize(y, hparams.quantize_channels - 1)
    else:
        # (B, T)
        if hparams.output_distribution == "Logistic":
            y_hat = sample_from_discretized_mix_logistic(
                y_hat, log_scale_min=hparams.log_scale_min)
        elif hparams.output_distribution == "Normal":
            y_hat = sample_from_mix_gaussian(
                y_hat, log_scale_min=hparams.log_scale_min)
        else:
            assert False

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "intermediate", "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
def build_model(name='teacher'):
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)
    if name == 'teacher':
        return getattr(builder, hparams.builder)(
            out_channels=hparams.out_channels,
            layers=hparams.layers,
            stacks=hparams.stacks,
            residual_channels=hparams.residual_channels,
            gate_channels=hparams.gate_channels,
            skip_out_channels=hparams.skip_out_channels,
            cin_channels=hparams.cin_channels,
            gin_channels=hparams.gin_channels,
            weight_normalization=hparams.weight_normalization,
            n_speakers=hparams.n_speakers,
            dropout=hparams.dropout,
            kernel_size=hparams.kernel_size,
            upsample_conditional_features=hparams.upsample_conditional_features,
            upsample_scales=hparams.upsample_scales,
            freq_axis_kernel_size=hparams.freq_axis_kernel_size,
            scalar_input=is_scalar_input(hparams.input_type),
        )
    else:
        return StudentWaveNet()
def build_model():
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
    model = getattr(builder, hparams.builder)(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        weight_normalization=hparams.weight_normalization,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_scales=hparams.upsample_scales,
        freq_axis_kernel_size=hparams.freq_axis_kernel_size,
        scalar_input=is_scalar_input(hparams.input_type),
    )
    return model
def build_model():
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)
    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )
    return model
def _process_utterance(out_dir, wav_path):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    return mel_spectrogram.astype(np.float32)
def _process_utterance(out_dir, index, wav_path, text): # Load the audio to a numpy array: wav = audio.load_wav(wav_path) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: audio_filename = 'ljspeech-audio-%05d.npy' % index mel_filename = 'ljspeech-mel-%05d.npy' % index # np.save(os.path.join(out_dir, audio_filename), # out.astype(out_dtype), allow_pickle=False) # np.save(os.path.join(out_dir, mel_filename), # mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (audio_filename, mel_filename, timesteps, text)
def build_vqvae_model(): if is_mulaw_quantize(hparams.input_type): if hparams.out_channels != hparams.quantize_channels: raise RuntimeError( "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'" ) if hparams.upsample_conditional_features and hparams.cin_channels < 0: s = "Upsample conv layers were specified while local conditioning disabled. " s += "Notice that upsample conv layers will never be used." warn(s) upsample_params = hparams.upsample_params upsample_params["cin_channels"] = hparams.cin_channels upsample_params["cin_pad"] = hparams.cin_pad wavenet = WaveNet( out_channels=hparams.out_channels, layers=hparams.layers, stacks=hparams.stacks, residual_channels=hparams.residual_channels, gate_channels=hparams.gate_channels, skip_out_channels=hparams.skip_out_channels, cin_channels=hparams.cin_channels, gin_channels=hparams.gin_channels, n_speakers=hparams.n_speakers, dropout=hparams.dropout, kernel_size=hparams.kernel_size, cin_pad=hparams.cin_pad, upsample_conditional_features=hparams.upsample_conditional_features, upsample_params=upsample_params, scalar_input=is_scalar_input(hparams.input_type), output_distribution=hparams.output_distribution, use_speaker_embedding=True, ) if hparams.use_K1 and hparams.K1 != hparams.K: K1 = hparams.K1 else: K1 = None if hparams.post_conv: hid = 64 else: hid = hparams.cin_channels model = VQVAE(wavenet=wavenet, c_in=39, hid=hid, frame_rate=hparams.frame_rate, use_time_jitter=hparams.time_jitter, K=hparams.K, ema=hparams.ema, sliced=hparams.sliced, ins_norm=hparams.ins_norm, post_conv=hparams.post_conv, adain=hparams.adain, dropout=hparams.vq_drop, drop_dim=hparams.drop_dim, K1=K1, num_slices=hparams.num_slices) return model
def train_loop(model, data_loaders, optimizer, writer, checkpoint_dir=None, scheduler=None): if use_cuda: model = model.cuda() if is_mulaw_quantize(hparams.input_type): criterion = MaskedCrossEntropyLoss() else: criterion = DiscretizedMixturelogisticLoss() if hparams.exponential_moving_average: ema = ExponentialMovingAverage(hparams.ema_decay) for name, param in model.named_parameters(): if param.requires_grad: ema.register(name, param.data) else: ema = None global global_step, global_epoch, global_test_step while global_epoch < hparams.nepochs: for phase, data_loader in data_loaders.items(): train = (phase == "train") running_loss = 0. test_evaluated = False for step, (x, y, c, g, input_lengths) in tqdm(enumerate(data_loader)): # Whether to save eval (i.e., online decoding) result do_eval = False eval_dir = join(checkpoint_dir, "{}_eval".format(phase)) # Do eval per eval_interval for train if train and global_step > 0 \ and global_step % hparams.train_eval_interval == 0: do_eval = True # Do eval for test # NOTE: Decoding WaveNet is quite time consuming, so # do only once in a single epoch for testset if not train and not test_evaluated \ and global_epoch % hparams.test_eval_epoch_interval == 0: do_eval = True test_evaluated = True if do_eval: print("[{}] Eval at train step {}".format(phase, global_step)) # Do step running_loss += __train_step( phase, global_epoch, global_step, global_test_step, model, optimizer, writer, criterion, x, y, c, g, input_lengths, checkpoint_dir, eval_dir, do_eval, ema, scheduler=scheduler) # update global state if train: global_step += 1 else: global_test_step += 1 # log per epoch averaged_loss = running_loss / len(data_loader) writer.add_scalar("{} loss (per epoch)".format(phase),averaged_loss.item(), global_epoch) print("Step {} [{}] Loss: {}".format(global_step, phase, running_loss / len(data_loader))) global_epoch += 1
def batch_wavegen(model, c=None, g=None, fast=True, tqdm=tqdm, length=None, writing_dir=None): from train import sanity_check sanity_check(model, c, g) # assert c is not None if c is not None: B = c.shape[0] else: B = 1 #c.shape[0] model.eval() if fast: model.make_generation_fast_() # Transform data to GPU g = None if g is None else g.to(device) c = None if c is None else c.to(device) if hparams.upsample_conditional_features and length is None: length = (c.shape[-1] - hparams.cin_pad * 2) * audio.get_hop_size() with torch.no_grad(): y_hat = model.incremental_forward( c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True, log_scale_min=hparams.log_scale_min) y_hat_sample = y_hat.max(1)[1].view(B, -1).float() cross_entropy = model.binary_softmax_loss(y_hat_sample.unsqueeze(1), c) # Write the output with open(join(writing_dir, "info.json"), "w") as f: data = {"0.244" : float(cross_entropy.detach().cpu().numpy())} json.dump(data, f, indent=4) if is_mulaw_quantize(hparams.input_type): # needs to be float since mulaw_inv returns in range of [-1, 1] y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy() for i in range(B): y_hat[i] = P.inv_mulaw_quantize(y_hat[i], hparams.quantize_channels - 1) elif is_linear_quantize(hparams.input_type): y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy() for i in range(B): y_hat[i] = inv_linear_quantize(y_hat[i], hparams.quantize_channels - 1) elif is_mulaw(hparams.input_type): y_hat = y_hat.view(B, -1).cpu().data.numpy() for i in range(B): y_hat[i] = P.inv_mulaw(y_hat[i], hparams.quantize_channels - 1) else: y_hat = y_hat.view(B, -1).cpu().data.numpy() if hparams.postprocess is not None and hparams.postprocess not in ["", "none"]: for i in range(B): y_hat[i] = getattr(audio, hparams.postprocess)(y_hat[i]) if hparams.global_gain_scale > 0: for i in range(B): y_hat[i] /= hparams.global_gain_scale return y_hat
def build_model(): if is_mulaw_quantize(hparams.input_type): if hparams.out_channels != hparams.quantize_channels: raise RuntimeError( "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'" ) if hparams.upsample_conditional_features and hparams.cin_channels < 0: s = "Upsample conv layers were specified while local conditioning disabled. " s += "Notice that upsample conv layers will never be used." warn(s) upsample_params = hparams.upsample_params upsample_params["cin_channels"] = hparams.cin_channels upsample_params["cin_pad"] = hparams.cin_pad if hparams.name == 'new_inae': use_speaker_embedding = False else: use_speaker_embedding = True wavenet = WaveNet( out_channels=hparams.out_channels, layers=hparams.layers, stacks=hparams.stacks, residual_channels=hparams.residual_channels, gate_channels=hparams.gate_channels, skip_out_channels=hparams.skip_out_channels, cin_channels=hparams.cin_channels, gin_channels=hparams.gin_channels, n_speakers=hparams.n_speakers, dropout=hparams.dropout, kernel_size=hparams.kernel_size, cin_pad=hparams.cin_pad, upsample_conditional_features=hparams.upsample_conditional_features, upsample_params=upsample_params, scalar_input=is_scalar_input(hparams.input_type), output_distribution=hparams.output_distribution, use_speaker_embedding=use_speaker_embedding, ) if hparams.name == 'inae': model = INAE(wavenet=wavenet, c_in=39, hid=64, frame_rate=hparams.frame_rate, adain=hparams.adain) elif hparams.name == 'inae1': model = INAE1(wavenet=wavenet, c_in=39, hid=64, frame_rate=hparams.frame_rate, adain=hparams.adain) elif hparams.name == 'new_inae': model = NewINAE(wavenet=wavenet, c_in=39, hid=64, frame_rate=hparams.frame_rate) return model
def batch_wavegen(model, c=None, g=None, fast=True, tqdm=tqdm, length=None): from train import sanity_check sanity_check(model, c, g) # assert c is not None if c is not None: B = c.shape[0] else: B = 1 #c.shape[0] model.eval() if fast: model.make_generation_fast_() # Transform data to GPU g = None if g is None else g.to(device) c = None if c is None else c.to(device) if hparams.upsample_conditional_features and length is None: length = (c.shape[-1] - hparams.cin_pad * 2) * audio.get_hop_size() with torch.no_grad(): y_hat = model.incremental_forward(c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True, log_scale_min=hparams.log_scale_min) if is_mulaw_quantize(hparams.input_type): # needs to be float since mulaw_inv returns in range of [-1, 1] y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy() for i in range(B): y_hat[i] = P.inv_mulaw_quantize(y_hat[i], hparams.quantize_channels - 1) elif is_linear_quantize(hparams.input_type): y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy() for i in range(B): y_hat[i] = inv_linear_quantize(y_hat[i], hparams.quantize_channels - 1) elif is_mulaw(hparams.input_type): y_hat = y_hat.view(B, -1).cpu().data.numpy() for i in range(B): y_hat[i] = P.inv_mulaw(y_hat[i], hparams.quantize_channels - 1) else: y_hat = y_hat.view(B, -1).cpu().data.numpy() if hparams.postprocess is not None and hparams.postprocess not in [ "", "none" ]: for i in range(B): y_hat[i] = getattr(audio, hparams.postprocess)(y_hat[i]) if hparams.global_gain_scale > 0: for i in range(B): y_hat[i] /= hparams.global_gain_scale return y_hat
def save_states(global_step, writer, y_hat, y, y_student,scale_tot, input_lengths, checkpoint_dir=None): print("Save intermediate states at step {}".format(global_step)) idx = np.random.randint(0, len(y_hat)) length = input_lengths[idx].data.cpu().numpy() # (B, C, T) if y_hat.dim() == 4: y_hat = y_hat.squeeze(-1) if is_mulaw_quantize(hparams.input_type): # (B, T) y_hat = F.softmax(y_hat, dim=1).max(1)[1] # (T,) y_hat = y_hat[idx].data.cpu().long().numpy() y = y[idx].view(-1).data.cpu().long().numpy() y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) y = P.inv_mulaw_quantize(y, hparams.quantize_channels) else: # (B, T) scale = y_hat[:,1:,:] teacher_log_scale = scale.data.cpu().numpy() student_log_scale = torch.log(scale_tot).data.cpu().numpy() writer.add_histogram('log_teacher_scale', teacher_log_scale, global_step) writer.add_histogram('log_student_scale', student_log_scale, global_step) y_hat = sample_from_discretized_gaussian( y_hat, log_scale_min=hparams.log_scale_min) # (T,) y_hat = y_hat[idx].view(-1).data.cpu().numpy() y = y[idx].view(-1).data.cpu().numpy() if is_mulaw(hparams.input_type): y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels) y = P.inv_mulaw(y, hparams.quantize_channels) # Mask by length y_hat[length:] = 0 y[length:] = 0 y_student = y_student[idx].view(-1).data.cpu().numpy() y_student[length:] = 0 # Save audio audio_dir = join(checkpoint_dir, "audio") os.makedirs(audio_dir, exist_ok=True) path = join(audio_dir, "step{:09d}_teacher_predicted.wav".format(global_step)) librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate) path = join(audio_dir, "step{:09d}_student_predicted.wav".format(global_step)) librosa.output.write_wav(path, y_student, sr=hparams.sample_rate) path = join(audio_dir, "step{:09d}_target.wav".format(global_step)) librosa.output.write_wav(path, y, sr=hparams.sample_rate) path = join(audio_dir, "step{:09d}.jpg".format(global_step)) save_waveplot(path,y_teacher=y_hat,y_student=y_student,y_target=y,writer=writer,global_step=global_step)
def create_model(name, hparams, init=False):
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
    if name == 'WaveNet':
        return WaveNet(hparams, init)
    else:
        raise Exception('Unknown model: {}'.format(name))
def batch_wavegen(hparam, net, c_input=None, g_input=None, tqdm_=None, is_numpy=True):
    """Generate audio."""
    assert c_input is not None
    B = c_input.shape[0]
    net.set_train(False)

    if hparam.upsample_conditional_features:
        length = (c_input.shape[-1] - hparam.cin_pad * 2) * audio.get_hop_size()
    else:
        # already duplicated
        length = c_input.shape[-1]

    y_hat = net.incremental_forward(c=c_input, g=g_input, T=length, tqdm=tqdm_,
                                    softmax=True, quantize=True,
                                    log_scale_min=hparam.log_scale_min,
                                    is_numpy=is_numpy)

    if is_mulaw_quantize(hparam.input_type):
        # needs to be float since mulaw_inv returns in range of [-1, 1]
        y_hat = np.reshape(np.argmax(y_hat, 1), (B, -1))
        y_hat = y_hat.astype(np.float32)
        for k in range(B):
            y_hat[k] = P.inv_mulaw_quantize(y_hat[k], hparam.quantize_channels - 1)
    elif is_mulaw(hparam.input_type):
        y_hat = np.reshape(y_hat, (B, -1))
        for k in range(B):
            y_hat[k] = P.inv_mulaw(y_hat[k], hparam.quantize_channels - 1)
    else:
        y_hat = np.reshape(y_hat, (B, -1))

    if hparam.postprocess is not None and hparam.postprocess not in ["", "none"]:
        for k in range(B):
            y_hat[k] = getattr(audio, hparam.postprocess)(y_hat[k])

    if hparam.global_gain_scale > 0:
        for k in range(B):
            y_hat[k] /= hparam.global_gain_scale

    return y_hat
def save_states(global_step, writer, y_hat, y, y_student, input_lengths, checkpoint_dir=None): print("Save intermediate states at step {}".format(global_step)) idx = np.random.randint(0, len(y_hat)) length = input_lengths[idx].data.cpu().numpy() # (B, C, T) if y_hat.dim() == 4: y_hat = y_hat.squeeze(-1) if is_mulaw_quantize(hparams.input_type): # (B, T) y_hat = F.softmax(y_hat, dim=1).max(1)[1] # (T,) y_hat = y_hat[idx].data.cpu().long().numpy() y = y[idx].view(-1).data.cpu().long().numpy() y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) y = P.inv_mulaw_quantize(y, hparams.quantize_channels) else: # (B, T) y_hat = sample_from_discretized_mix_logistic( y_hat, log_scale_min=hparams.log_scale_min) # (T,) y_hat = y_hat[idx].view(-1).data.cpu().numpy() y = y[idx].view(-1).data.cpu().numpy() if is_mulaw(hparams.input_type): y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels) y = P.inv_mulaw(y, hparams.quantize_channels) # Mask by length y_hat[length:] = 0 y[length:] = 0 y_student = y_student.cpu().data.numpy() # Save audio audio_dir = join(checkpoint_dir, "audio") os.makedirs(audio_dir, exist_ok=True) path = join(audio_dir, "step{:09d}_teacher.wav".format(global_step)) librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate) path = join(audio_dir, "step{:09d}_target.wav".format(global_step)) librosa.output.write_wav(path, y, sr=hparams.sample_rate) path = join(audio_dir, "step{:09d}_student.wav".format(global_step)) y_student = y_student[idx].reshape(y_student.shape[-1]) librosa.output.write_wav(path, y_student, sr=hparams.sample_rate) path = join(audio_dir, "step{:09d}_wave.png".format(global_step)) save_waveplot(path, y_student, y, y_hat)
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()

    timesteps = len(out)
    return out, mel_spectrogram, timesteps, out_dtype
def save_ref_audio(hparam, ref, length, target_wav_path_):
    """Save reference audio."""
    if is_mulaw_quantize(hparam.input_type):
        ref = np.reshape(np.argmax(ref, 0), (-1))[:length]
        ref = ref.astype(np.float32)
    else:
        ref = np.reshape(ref, (-1))[:length]

    if is_mulaw_quantize(hparam.input_type):
        ref = P.inv_mulaw_quantize(ref, hparam.quantize_channels - 1)
    elif is_mulaw(hparam.input_type):
        ref = P.inv_mulaw(ref, hparam.quantize_channels - 1)

    if hparam.postprocess is not None and hparam.postprocess not in ["", "none"]:
        ref = getattr(audio, hparam.postprocess)(ref)

    if hparam.global_gain_scale > 0:
        ref /= hparam.global_gain_scale

    ref = np.clip(ref, -1.0, 1.0)
    wavfile.write(target_wav_path_, hparam.sample_rate, to_int16(ref))
def create_model(name, hparams):
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)
    if name == 'WaveNet':
        return WaveNet(hparams)
    else:
        raise Exception('Unknown model: {}'.format(name))
def build_catae_model(): if is_mulaw_quantize(hparams.input_type): if hparams.out_channels != hparams.quantize_channels: raise RuntimeError( "out_channels must equal to quantize_chennels if input_type is 'mulaw-quantize'" ) if hparams.upsample_conditional_features and hparams.cin_channels < 0: s = "Upsample conv layers were specified while local conditioning disabled. " s += "Notice that upsample conv layers will never be used." warn(s) upsample_params = hparams.upsample_params upsample_params["cin_channels"] = hparams.cin_channels upsample_params["cin_pad"] = hparams.cin_pad wavenet = WaveNet( out_channels=hparams.out_channels, layers=hparams.layers, stacks=hparams.stacks, residual_channels=hparams.residual_channels, gate_channels=hparams.gate_channels, skip_out_channels=hparams.skip_out_channels, cin_channels=hparams.cin_channels, gin_channels=hparams.gin_channels, n_speakers=hparams.n_speakers, dropout=hparams.dropout, kernel_size=hparams.kernel_size, cin_pad=hparams.cin_pad, upsample_conditional_features=hparams.upsample_conditional_features, upsample_params=upsample_params, scalar_input=is_scalar_input(hparams.input_type), output_distribution=hparams.output_distribution, use_speaker_embedding=True, ) model = CatWavAE(wavenet=wavenet, c_in=39, hid=hparams.cin_channels, tau=0.1, k=hparams.K, frame_rate=hparams.frame_rate, hard=hparams.hard, slices=hparams.num_slices) return model
def __init__(self, network, hparams):
    super(NetWithLossClass, self).__init__(auto_prefix=False)
    self.network = network
    self.hparams = hparams
    self.ReduceMean_false = P.ReduceMean(keep_dims=False)
    self.expand_op = P.ExpandDims()
    self.transpose_op = P.Transpose()
    self.reshape_op = P.Reshape()
    self.is_mulaw_quant = is_mulaw_quantize(hparams.input_type)

    if self.is_mulaw_quant:
        self.criterion = MaskedCrossEntropyLoss()
    else:
        if hparams.output_distribution == "Logistic":
            self.criterion = DiscretizedMixturelogisticLoss(hparams)
        elif hparams.output_distribution == "Normal":
            self.criterion = MixtureGaussianLoss(hparams)
        else:
            self.criterion = None
            raise RuntimeError(
                "Not supported output distribution type: {}".format(
                    hparams.output_distribution))
def build_model(hparams_json=None):
    # NOTE: if a JSON config is given, it overrides the module-level hparams;
    # declaring it global here avoids an UnboundLocalError on the references below.
    global hparams
    if hparams_json is not None:
        with open(hparams_json, 'r') as jf:
            hparams = HParams(**json.load(jf))

    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warn(s)

    upsample_params = hparams.upsample_params
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    use_speaker_embedding = True if hparams.gin_channels > 0 else False
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_net=hparams.upsample_net,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        use_speaker_embedding=use_speaker_embedding,
        output_distribution=hparams.output_distribution,
    )
    return model
def wavegen(model, length=None, c=None, g=None, initial_value=None, fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding it to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = Variable(torch.FloatTensor(c.T).unsqueeze(0))

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value)

    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
        log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    return y_hat
def _process_utterance(out_dir, index, wav_path, text, trim_silence=False): # Load the audio to a numpy array: wav = audio.load_wav(wav_path) # Trim begin/end silences # NOTE: the threshold was chosen for clean signals # TODO: Remove, get this out of here. if trim_silence: wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512) if hparams.highpass_cutoff > 0.0: wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff) # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # Trim silences in mul-aw quantized domain silence_threshold = 0 if silence_threshold > 0: # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels - 1) start, end = audio.start_and_end_indices(out, silence_threshold) wav = wav[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] constant_values = P.mulaw(0.0, hparams.quantize_channels - 1) out_dtype = np.float32 else: # [-1, 1] constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T if hparams.global_gain_scale > 0: wav *= hparams.global_gain_scale # Time domain preprocessing if hparams.preprocess is not None and hparams.preprocess not in [ "", "none" ]: f = getattr(audio, hparams.preprocess) wav = f(wav) # Clip if np.abs(wav).max() > 1.0: print("""Warning: abs max value exceeds 1.0: {}""".format( np.abs(wav).max())) # ignore this sample return ("dummy", "dummy", -1, "dummy") wav = np.clip(wav, -1.0, 1.0) # Set waveform target (out) if is_mulaw_quantize(hparams.input_type): out = P.mulaw_quantize(wav, hparams.quantize_channels - 1) elif is_mulaw(hparams.input_type): out = P.mulaw(wav, hparams.quantize_channels - 1) else: out = wav # zero pad # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size()) if l > 0 or r > 0: out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 assert_ready_for_upsampling(out, mel_spectrogram, cin_pad=0, debug=True) # Write the spectrograms to disk: name = splitext(basename(wav_path))[0] audio_filename = "%s-wave.npy" % (name) mel_filename = "%s-feats.npy" % (name) np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save( os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False, ) # Return a tuple describing this training example: return (audio_filename, mel_filename, N, text)
print(hparams_debug_string())
fs = hparams.sample_rate

os.makedirs(args.checkpoint_dir, exist_ok=True)
output_json_path = join(args.checkpoint_dir, "hparams.json")
with open(output_json_path, "w") as f:
    json.dump(hparams.values(), f, indent=2)

data_loaders = get_data_loaders(args.data_path, args.speaker_id, hparams=hparams,
                                rank_id=rank_id, group_size=group_size)
step_size_per_epoch = data_loaders.get_dataset_size()

if is_mulaw_quantize(hparams.input_type):
    if hparams.out_channels != hparams.quantize_channels:
        raise RuntimeError(
            "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
if hparams.upsample_conditional_features and hparams.cin_channels < 0:
    s = "Upsample conv layers were specified while local conditioning disabled. "
    s += "Notice that upsample conv layers will never be used."
    warn(s)

upsample_params = hparams.upsample_params
upsample_params["cin_channels"] = hparams.cin_channels
upsample_params["cin_pad"] = hparams.cin_pad
model = WaveNet(
    out_channels=hparams.out_channels,
    layers=hparams.layers,
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print('file {} present in csv metadata is not present in wav folder. skipping!'.format( wav_path)) return None #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #Mu-law quantize if is_mulaw_quantize(hparams.input_type): #[0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) #Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start: end] out = out[start: end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): #[-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: #[-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None #Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad for quantized signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk audio_filename = 'speech-audio-{:05d}.npy'.format(index) mel_filename = 'speech-mel-{:05d}.npy'.format(index) linear_filename = 'speech-linear-{:05d}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print( 'file {} present in csv metadata is not present in wav folder. skipping!' .format(wav_path)) return None #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #Mu-law quantize if is_mulaw_quantize(hparams.input_type): #[0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) #Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): #[-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: #[-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None #Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad for quantized signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk audio_filename = 'audio-{}.npy'.format(index) mel_filename = 'mel-{}.npy'.format(index) linear_filename = 'linear-{}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
def collate_fn(batch): """Create batch Args: batch(tuple): List of tuples - x[0] (ndarray,int) : list of (T,) - x[1] (ndarray,int) : list of (T, D) - x[2] (ndarray,int) : list of (1,), speaker id Returns: tuple: Tuple of batch - x (FloatTensor) : Network inputs (B, C, T) - y (LongTensor) : Network targets (B, T, 1) """ local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0 global_conditioning = len(batch[0]) >= 3 and hparams.gin_channels > 0 # To save GPU memory... I don't want to do this though if hparams.max_time_sec is not None: max_time_steps = int(hparams.max_time_sec * hparams.sample_rate) elif hparams.max_time_steps is not None: max_time_steps = hparams.max_time_steps else: max_time_steps = None # Time resolution adjastment if local_conditioning: new_batch = [] for idx in range(len(batch)): x, c, g = batch[idx] if hparams.upsample_conditional_features: assert_ready_for_upsampling(x, c) if max_time_steps is not None: max_steps = ensure_divisible(max_time_steps, audio.get_hop_size(), True) if len(x) > max_steps: max_time_frames = max_steps // audio.get_hop_size() s = np.random.randint(0, len(c) - max_time_frames) ts = s * audio.get_hop_size() x = x[ts:ts + audio.get_hop_size() * max_time_frames] c = c[s:s + max_time_frames, :] assert_ready_for_upsampling(x, c) else: x, c = audio.adjast_time_resolution(x, c) if max_time_steps is not None and len(x) > max_time_steps: s = np.random.randint(0, len(x) - max_time_steps) x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :] assert len(x) == len(c) new_batch.append((x, c, g)) batch = new_batch else: new_batch = [] for idx in range(len(batch)): x, c, g = batch[idx] x = audio.trim(x) if max_time_steps is not None and len(x) > max_time_steps: s = np.random.randint(0, len(x) - max_time_steps) if local_conditioning: x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :] else: x = x[s:s + max_time_steps] new_batch.append((x, c, g)) batch = new_batch # Lengths input_lengths = [len(x[0]) for x in batch] max_input_len = max(input_lengths) # (B, T, C) # pad for time-axis if is_mulaw_quantize(hparams.input_type): x_batch = np.array([ _pad_2d( np_utils.to_categorical(x[0], num_classes=hparams.quantize_channels), max_input_len) for x in batch ], dtype=np.float32) else: x_batch = np.array( [_pad_2d(x[0].reshape(-1, 1), max_input_len) for x in batch], dtype=np.float32) assert len(x_batch.shape) == 3 # (B, T) if is_mulaw_quantize(hparams.input_type): y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.int) else: y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.float32) assert len(y_batch.shape) == 2 # (B, T, D) if local_conditioning: max_len = max([len(x[1]) for x in batch]) c_batch = np.array([_pad_2d(x[1], max_len) for x in batch], dtype=np.float32) assert len(c_batch.shape) == 3 # (B x C x T) c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous() else: c_batch = None if global_conditioning: g_batch = torch.LongTensor([x[2] for x in batch]) else: g_batch = None # Covnert to channel first i.e., (B, C, T) x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous() # Add extra axis if is_mulaw_quantize(hparams.input_type): y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous() else: y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous() input_lengths = torch.LongTensor(input_lengths) return x_batch, y_batch, c_batch, g_batch, input_lengths
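# Hypothetical usage sketch (not part of the original sources): wiring the collate_fn
# above into a PyTorch DataLoader. `dataset` is a placeholder for the project's real
# Dataset, whose __getitem__ returns (x, c, g) tuples as described in the collate_fn
# docstring; batch sizes and worker counts are illustrative only.
from torch.utils.data import DataLoader

loader = DataLoader(dataset,                 # placeholder: any Dataset yielding (x, c, g)
                    batch_size=8,
                    shuffle=True,
                    num_workers=2,
                    collate_fn=collate_fn)

for x_batch, y_batch, c_batch, g_batch, input_lengths in loader:
    # x_batch: (B, C, T) network input, y_batch: (B, T, 1) targets,
    # c_batch/g_batch: local/global conditioning, input_lengths: (B,) for masking
    break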
def _process_utterance(out_dir, index, speaker_id, wav_path, text): sr = hparams.sample_rate # Load the audio to a numpy array. Resampled if needed wav = audio.load_wav(wav_path) lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") # Trim silence from hts labels if available # TODO if exists(lab_path) and False: labels = hts.load(lab_path) b = int(start_at(labels) * 1e-7 * sr) e = int(end_at(labels) * 1e-7 * sr) wav = wav[b:e] wav, _ = librosa.effects.trim(wav, top_db=20) else: wav, _ = librosa.effects.trim(wav, top_db=20) if hparams.rescaling: wav = wav / np.abs(wav).max() * hparams.rescaling_max # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = P.mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = P.mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = P.mulaw(wav, hparams.quantize_channels) constant_values = P.mulaw(0.0, hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0.0 out_dtype = np.float32 # Compute a mel-scale spectrogram from the trimmed wav: # (N, D) mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T # lws pads zeros internally before performing stft # this is needed to adjust time resolution between audio and mel-spectrogram l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) # zero pad for quantized signal out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) N = mel_spectrogram.shape[0] assert len(out) >= N * audio.get_hop_size() # time resolution adjustment # ensure length of raw audio is multiple of hop_size so that we can use # transposed convolution to upsample out = out[:N * audio.get_hop_size()] assert len(out) % audio.get_hop_size() == 0 timesteps = len(out) # Write the spectrograms to disk: audio_filename = 'cmu_arctic-audio-%05d.npy' % index mel_filename = 'cmu_arctic-mel-%05d.npy' % index np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.astype(np.float32), allow_pickle=False) # Return a tuple describing this training example: return (audio_filename, mel_filename, timesteps, text, speaker_id)
def __train_step(phase, epoch, global_step, global_test_step,
                 model, optimizer, writer, criterion,
                 x, y, c, g, input_lengths,
                 checkpoint_dir, eval_dir=None, do_eval=False, ema=None):
    sanity_check(model, c, g)

    # x : (B, C, T)
    # y : (B, T, 1)
    # c : (B, C, T)
    # g : (B,)
    train = (phase == "train")
    clip_thresh = hparams.clip_thresh
    if train:
        model.train()
        step = global_step
    else:
        model.eval()
        step = global_test_step

    # Learning rate schedule
    current_lr = hparams.initial_learning_rate
    if train and hparams.lr_schedule is not None:
        lr_schedule_f = getattr(lrschedule, hparams.lr_schedule)
        current_lr = lr_schedule_f(hparams.initial_learning_rate, step,
                                   **hparams.lr_schedule_kwargs)
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_lr
    optimizer.zero_grad()

    # Prepare data
    x, y = Variable(x), Variable(y, requires_grad=False)
    c = Variable(c) if c is not None else None
    g = Variable(g) if g is not None else None
    input_lengths = Variable(input_lengths)
    if use_cuda:
        x, y = x.cuda(), y.cuda()
        input_lengths = input_lengths.cuda()
        c = c.cuda() if c is not None else None
        g = g.cuda() if g is not None else None

    # (B, T, 1)
    mask = sequence_mask(input_lengths, max_len=x.size(-1)).unsqueeze(-1)
    mask = mask[:, 1:, :]

    # Apply model: Run the model in regular eval mode
    # NOTE: softmax is handled in F.cross_entropy
    # y_hat: (B x C x T)
    y_hat = model(x, c=c, g=g, softmax=False)

    if is_mulaw_quantize(hparams.input_type):
        # we need 4d inputs for spatial cross entropy loss
        # (B, C, T, 1)
        y_hat = y_hat.unsqueeze(-1)
        loss = criterion(y_hat[:, :, :-1, :], y[:, 1:, :], mask=mask)
    else:
        loss = criterion(y_hat[:, :, :-1], y[:, 1:, :], mask=mask)

    if train and step > 0 and step % hparams.checkpoint_interval == 0:
        save_states(step, writer, y_hat, y, input_lengths, checkpoint_dir)
        save_checkpoint(model, optimizer, step, checkpoint_dir, epoch, ema)

    if do_eval:
        # NOTE: use train step (i.e., global_step) for filename
        eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir, ema)

    # Update
    if train:
        loss.backward()
        if clip_thresh > 0:
            grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), clip_thresh)
        optimizer.step()
        # update moving average
        if ema is not None:
            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

    # Logs
    writer.add_scalar("{} loss".format(phase), float(loss.data[0]), step)
    if train:
        if clip_thresh > 0:
            writer.add_scalar("gradient norm", grad_norm, step)
        writer.add_scalar("learning rate", current_lr, step)

    return loss.data[0]
def eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(model, ema)

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Initial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1).fill_(initial_value))
    initial_input = initial_input.cuda() if use_cuda else initial_input

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, softmax=True, quantize=True, tqdm=tqdm,
        log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def wavegen(model, length=None, c=None, g=None, initial_value=None, fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        if c.ndim != 2:
            raise RuntimeError(
                "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given."
                .format(hparams.cin_channels, c.shape))
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding it to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = torch.FloatTensor(c.T).unsqueeze(0)

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)

    g = None if g is None else torch.LongTensor([g])

    # Transform data to GPU
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in ["", "none"]:
        y_hat = getattr(audio, hparams.postprocess)(y_hat)

    if hparams.global_gain_scale > 0:
        y_hat /= hparams.global_gain_scale

    return y_hat
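# Hypothetical usage sketch (not from the original sources): how the wavegen() helper
# above might be driven end to end. It assumes the surrounding module already provides
# build_model(), hparams, and a trained checkpoint; "checkpoint.pth" and "feats.npy"
# are placeholder paths, and the checkpoint key "state_dict" is an assumption.
import numpy as np
import torch
import librosa

model = build_model()
state = torch.load("checkpoint.pth", map_location="cpu")  # placeholder path
model.load_state_dict(state["state_dict"])

# (T, cin_channels) local conditioning features, e.g. a log-mel spectrogram
c = np.load("feats.npy")  # placeholder path

# When c is given, wavegen() derives the output length from the feature size
waveform = wavegen(model, c=c, g=None, fast=True)
librosa.output.write_wav("generated.wav", waveform, sr=hparams.sample_rate)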