def build_model():
    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning is disabled. "
        s += "Note that upsample conv layers will never be used."
        warn(s)

    model = getattr(builder, hparams.builder)(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        weight_normalization=hparams.weight_normalization,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_scales=hparams.upsample_scales,
        freq_axis_kernel_size=hparams.freq_axis_kernel_size,
        scalar_input=is_scalar_input(hparams.input_type),
        legacy=hparams.legacy,
    )
    return model
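
def _build_model_example():
    # Usage sketch (an illustrative addition, not part of the original module):
    # build the model from hparams and move it to the training device. Assumes
    # hparams has already been loaded/overridden elsewhere, as in the
    # surrounding script.
    import torch
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = build_model().to(device)
    return model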
def save_states(global_step, writer, y_hat, y, input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().item()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
    else:
        # (B, T)
        y_hat = sample_from_discretized_mix_logistic(
            y_hat, log_scale_min=hparams.log_scale_min)
        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
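
def _write_wav_compat(path, y, sr):
    # Compatibility sketch (an addition, not original code): librosa removed
    # librosa.output.write_wav in version 0.8, so the writes above fail on
    # newer librosa. The soundfile package (an assumed extra dependency)
    # provides a drop-in replacement with the same (path, data, samplerate)
    # argument ordering.
    import soundfile as sf
    sf.write(path, y, sr)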
def _process_utterance(wav_path, out_dir):
    fname = wav_path.split(os.sep)[-1].split(".")[0]
    audio_filename = '{}_resolved.npy'.format(fname)
    mel_filename = '{}_mel.npy'.format(fname)
    apth = os.path.join(out_dir, audio_filename)
    mpth = os.path.join(out_dir, mel_filename)
    if os.path.exists(apth) and os.path.exists(mpth):
        print("File {} already processed".format(wav_path))
        return

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # Zero pad the quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # Time resolution adjustment: ensure the length of the raw audio is a
    # multiple of hop_size so that a transposed convolution can be used to
    # upsample the conditional features.
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the audio and mel-spectrogram to disk:
    np.save(apth, out.astype(out_dtype), allow_pickle=False)
    np.save(mpth, mel_spectrogram.astype(np.float32), allow_pickle=False)
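
def _hop_size_multiple_demo():
    # Illustrative sketch (an addition) of why the audio is truncated to a
    # multiple of hop_size above; hop_size=256, 80 mel bins, and N=100 frames
    # are assumed example values. A transposed convolution with
    # stride == kernel_size == hop_size maps N conditioning frames to exactly
    # N * hop_size samples: output length = (N - 1) * stride + kernel_size.
    import torch
    import torch.nn as nn
    hop_size, n_mels, N = 256, 80, 100
    mel = torch.randn(1, n_mels, N)  # (B, D, N) mel frames
    upsample = nn.ConvTranspose1d(n_mels, n_mels,
                                  kernel_size=hop_size, stride=hop_size)
    assert upsample(mel).size(-1) == N * hop_size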
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module): WaveNet decoder.
        length (int): Number of time steps to generate. If conditional
          features are given, this is overwritten by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C.
        g (scalar): Speaker ID.
        initial_value (int): Initial value for the WaveNet decoder.
        fast (bool): Whether to enable fast generation
          (removes weight normalization).
        tqdm (lambda): tqdm wrapper for the progress bar.

    Returns:
        numpy.ndarray: Generated waveform samples.
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        if c.ndim != 2:
            raise RuntimeError(
                "Expected 2-dim shape (T, {}) for the conditional feature, "
                "but {} was actually given.".format(hparams.cin_channels, c.shape))
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding them to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = torch.FloatTensor(c.T).unsqueeze(0)

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)

    g = None if g is None else torch.LongTensor([g])

    # Transfer data to the device in use
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True,
            quantize=True, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(),
                            hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    return y_hat
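
def _wavegen_example():
    # Usage sketch (an addition, not original code): synthesize a waveform
    # from precomputed mel features. The checkpoint and feature paths are
    # hypothetical, and the "state_dict" key is an assumption about the
    # checkpoint layout; build_model and device come from this module.
    import numpy as np
    import torch
    model = build_model().to(device)
    checkpoint = torch.load("checkpoint_step000100000.pth", map_location=device)
    model.load_state_dict(checkpoint["state_dict"])
    c = np.load("some_utterance_mel.npy")  # (T, cin_channels) conditioning
    waveform = wavegen(model, c=c, fast=True)
    return waveform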
def train_loop(device, model, data_loaders, optimizer, writer, checkpoint_dir=None):
    if is_mulaw_quantize(hparams.input_type):
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = DiscretizedMixturelogisticLoss()

    if hparams.exponential_moving_average:
        ema = ExponentialMovingAverage(hparams.ema_decay)
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)
    else:
        ema = None

    global global_step, global_epoch, global_test_step
    while global_epoch < hparams.nepochs:
        for phase, data_loader in data_loaders.items():
            train = (phase == "train")
            running_loss = 0.
            test_evaluated = False
            for step, (x, y, c, g, input_lengths) in tqdm(enumerate(data_loader)):
                # Whether to save eval (i.e., online decoding) result
                do_eval = False
                eval_dir = join(checkpoint_dir, "{}_eval".format(phase))
                # Do eval per eval_interval for train
                if train and global_step > 0 \
                        and global_step % hparams.train_eval_interval == 0:
                    do_eval = True
                # Do eval for test
                # NOTE: Decoding WaveNet is quite time consuming, so
                # do this only once per epoch for the test set
                if not train and not test_evaluated \
                        and global_epoch % hparams.test_eval_epoch_interval == 0:
                    do_eval = True
                    test_evaluated = True
                if do_eval:
                    print("[{}] Eval at train step {}".format(phase, global_step))

                # Do step
                running_loss += __train_step(
                    device, phase, global_epoch, global_step, global_test_step,
                    model, optimizer, writer, criterion, x, y, c, g,
                    input_lengths, checkpoint_dir, eval_dir, do_eval, ema)

                # Update global state
                if train:
                    global_step += 1
                else:
                    global_test_step += 1

            # Log per-epoch averaged loss
            averaged_loss = running_loss / len(data_loader)
            writer.add_scalar("{} loss (per epoch)".format(phase),
                              averaged_loss, global_epoch)
            print("Step {} [{}] Loss: {}".format(global_step, phase, averaged_loss))

        global_epoch += 1
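
class _ExponentialMovingAverageSketch(object):
    # Minimal sketch (an addition) of the EMA interface used above
    # (register / update / shadow). This illustrates the standard update
    # shadow <- decay * shadow + (1 - decay) * param; it is an assumption
    # about the behavior, not the repo's authoritative implementation.
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, val):
        self.shadow[name] = val.clone()

    def update(self, name, x):
        assert name in self.shadow
        self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * x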
def __train_step(device, phase, epoch, global_step, global_test_step,
                 model, optimizer, writer, criterion,
                 x, y, c, g, input_lengths,
                 checkpoint_dir, eval_dir=None, do_eval=False, ema=None):
    sanity_check(model, c, g)

    # x : (B, C, T)
    # y : (B, T, 1)
    # c : (B, C, T)
    # g : (B,)
    train = (phase == "train")
    clip_thresh = hparams.clip_thresh
    if train:
        model.train()
        step = global_step
    else:
        model.eval()
        step = global_test_step

    # Learning rate schedule
    current_lr = hparams.initial_learning_rate
    if train and hparams.lr_schedule is not None:
        lr_schedule_f = getattr(lrschedule, hparams.lr_schedule)
        current_lr = lr_schedule_f(
            hparams.initial_learning_rate, step, **hparams.lr_schedule_kwargs)
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_lr
    optimizer.zero_grad()

    # Prepare data
    x, y = x.to(device), y.to(device)
    input_lengths = input_lengths.to(device)
    c = c.to(device) if c is not None else None
    g = g.to(device) if g is not None else None

    # (B, T, 1)
    mask = sequence_mask(input_lengths, max_len=x.size(-1)).unsqueeze(-1)
    mask = mask[:, 1:, :]

    # Apply model
    # NOTE: softmax is handled inside the criterion (F.cross_entropy)
    # y_hat: (B x C x T)
    if use_cuda:
        # Multi-GPU support
        # NOTE: make sure that batch size % number of GPUs == 0
        y_hat = torch.nn.parallel.data_parallel(model, (x, c, g, False))
    else:
        y_hat = model(x, c, g, False)

    if is_mulaw_quantize(hparams.input_type):
        # We need 4-dim inputs for spatial cross-entropy loss
        # (B, C, T, 1)
        y_hat = y_hat.unsqueeze(-1)
        loss = criterion(y_hat[:, :, :-1, :], y[:, 1:, :], mask=mask)
    else:
        loss = criterion(y_hat[:, :, :-1], y[:, 1:, :], mask=mask)

    if train and step > 0 and step % hparams.checkpoint_interval == 0:
        save_states(step, writer, y_hat, y, input_lengths, checkpoint_dir)
        save_checkpoint(device, model, optimizer, step, checkpoint_dir, epoch, ema)

    if do_eval:
        # NOTE: use the train step (i.e., global_step) for the filename
        eval_model(global_step, writer, device, model, y, c, g,
                   input_lengths, eval_dir, ema)

    # Update
    if train:
        loss.backward()
        if clip_thresh > 0:
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_thresh)
        optimizer.step()
        # Update moving average
        if ema is not None:
            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

    # Logs
    writer.add_scalar("{} loss".format(phase), float(loss.item()), step)
    if train:
        if clip_thresh > 0:
            writer.add_scalar("gradient norm", grad_norm, step)
        writer.add_scalar("learning rate", current_lr, step)

    return loss.item()
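
def _sequence_mask_sketch(lengths, max_len=None):
    # Sketch (an addition) of the masking helper used above; this is an
    # assumption about its behavior, not the repo's implementation. It returns
    # a (B, max_len) float tensor with 1.0 at valid time steps and 0.0 at
    # padded ones, so the criterion can ignore padding. Note the mask is then
    # shifted by one above (mask[:, 1:, :]) to line up with the
    # one-step-ahead targets y[:, 1:, :].
    import torch
    max_len = max_len or lengths.max().item()
    ids = torch.arange(max_len, device=lengths.device)
    return (ids.unsqueeze(0) < lengths.unsqueeze(1)).float()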
def eval_model(global_step, writer, device, model, y, c, g, input_lengths,
               eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test global conditioning
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Initial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
            tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # Save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
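
def _save_waveplot_sketch(path, y_hat, y_target):
    # Hedged sketch (an addition) of the save_waveplot helper referenced
    # above; this is an assumption about its behavior, not the repo's code:
    # plot target and predicted waveforms stacked in one figure. Note that
    # librosa.display.waveplot exists in librosa < 0.10; newer versions
    # renamed it to waveshow.
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    import librosa.display
    sr = hparams.sample_rate
    plt.figure(figsize=(16, 6))
    plt.subplot(2, 1, 1)
    librosa.display.waveplot(y_target, sr=sr)
    plt.subplot(2, 1, 2)
    librosa.display.waveplot(y_hat, sr=sr)
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()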
def collate_fn(batch):
    """Create batch.

    Args:
        batch (list): List of tuples
            - x[0] (ndarray, int): audio samples, shape (T,)
            - x[1] (ndarray, float): local conditioning features, shape (T, D)
            - x[2] (ndarray, int): speaker id, shape (1,)

    Returns:
        tuple: Tuple of batch
            - x (FloatTensor): Network inputs (B, C, T)
            - y (LongTensor): Network targets (B, T, 1)
            - c (FloatTensor or None): Local conditioning features (B, C, T')
            - g (LongTensor or None): Speaker ids (B,)
            - input_lengths (LongTensor): Input lengths (B,)
    """
    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.gin_channels > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None

    # Time resolution adjustment
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(
                        max_time_steps, audio.get_hop_size(), True)
                    if len(x) > max_steps:
                        max_time_frames = max_steps // audio.get_hop_size()
                        s = np.random.randint(0, len(c) - max_time_frames)
                        ts = s * audio.get_hop_size()
                        x = x[ts:ts + audio.get_hop_size() * max_time_frames]
                        c = c[s:s + max_time_frames, :]
                        assert_ready_for_upsampling(x, c)
            else:
                x, c = audio.adjust_time_resolution(x, c)
                if max_time_steps is not None and len(x) > max_time_steps:
                    s = np.random.randint(0, len(x) - max_time_steps)
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                assert len(x) == len(c)
            new_batch.append((x, c, g))
        batch = new_batch
    else:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            x = audio.trim(x)
            if max_time_steps is not None and len(x) > max_time_steps:
                s = np.random.randint(0, len(x) - max_time_steps)
                x = x[s:s + max_time_steps]
            new_batch.append((x, c, g))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # Pad along the time axis
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        x_batch = np.array([_pad_2d(
            np_utils.to_categorical(x[0], num_classes=hparams.quantize_channels),
            max_input_len, padding_value) for x in batch], dtype=np.float32)
    else:
        x_batch = np.array([_pad_2d(x[0].reshape(-1, 1), max_input_len)
                            for x in batch], dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        y_batch = np.array([_pad(x[0], max_input_len, constant_values=padding_value)
                            for x in batch], dtype=np.int64)
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch],
                           dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    # Convert to channel-first, i.e., (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()
    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return x_batch, y_batch, c_batch, g_batch, input_lengths
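
def _data_loader_example(dataset):
    # Wiring sketch (an addition, not original code): hook collate_fn into a
    # PyTorch DataLoader. `dataset` is assumed to yield (x, c, g) tuples as
    # described in the collate_fn docstring; num_workers is an illustrative
    # value. Each batch then unpacks as (x, y, c, g, input_lengths), exactly
    # what train_loop above consumes.
    from torch.utils.data import DataLoader
    return DataLoader(dataset, batch_size=hparams.batch_size, shuffle=True,
                      num_workers=2, collate_fn=collate_fn)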