def do_test(self, writer, epoch, step, data_path, test_index):
    dataset = env.MultispeakerDataset(test_index, data_path)
    criterion = nn.NLLLoss().cuda()
    pad_left = self.pad_left()
    pad_left_encoder = self.pad_left_encoder()
    pad_left_decoder = self.pad_left_decoder()
    extra_pad_right = 0
    pad_right = self.pad_right() + extra_pad_right
    window = 16 * self.total_scale()
    test_loader = DataLoader(
        dataset,
        collate_fn=lambda batch: env.collate_multispeaker_samples(
            pad_left, window, pad_right, batch),
        batch_size=16,
        num_workers=2,
        shuffle=False,
        pin_memory=True)
    running_loss_c = 0.
    running_loss_f = 0.
    running_loss_vq = 0.
    running_loss_vqc = 0.
    running_entropy = 0.
    for i, (speaker, wave16) in enumerate(test_loader):
        speaker = speaker.cuda()
        wave16 = wave16.cuda()
        # Split each signed 16-bit sample into its high (coarse) and
        # low (fine) bytes, then rescale both to [-1, 1].
        coarse = (wave16 + 2**15) // 256
        fine = (wave16 + 2**15) % 256
        coarse_f = coarse.float() / 127.5 - 1.
        fine_f = fine.float() / 127.5 - 1.
        total_f = (wave16.float() + 0.5) / 32767.5
        # No input noise at test time.
        noisy_f = total_f
        # Decoder input: coarse/fine at time t plus coarse at t+1.
        x = torch.cat([
            coarse_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
            fine_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
            coarse_f[:, pad_left - pad_left_decoder + 1:1 - pad_right].unsqueeze(-1),
        ], dim=2)
        y_coarse = coarse[:, pad_left + 1:1 - pad_right]
        y_fine = fine[:, pad_left + 1:1 - pad_right]
        # No random translation of the encoder input at test time either.
        translated = noisy_f[:, pad_left - pad_left_encoder:]
        p_cf, vq_pen, encoder_pen, entropy = self(speaker, x, translated)
        p_c, p_f = p_cf
        loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
        loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
        # The combined training objective
        # (loss_c + loss_f + vq_pen + encoder_weight * encoder_pen)
        # is not needed here; the components are logged separately.
        running_loss_c += loss_c.item()
        running_loss_f += loss_f.item()
        running_loss_vq += vq_pen.item()
        running_loss_vqc += encoder_pen.item()
        running_entropy += entropy
    avg_loss_c = running_loss_c / (i + 1)
    avg_loss_f = running_loss_f / (i + 1)
    avg_loss_vq = running_loss_vq / (i + 1)
    avg_loss_vqc = running_loss_vqc / (i + 1)
    avg_entropy = running_entropy / (i + 1)
    # tensorboard writer
    writer.add_scalars(
        'Test/loss_group', {
            'loss_c': avg_loss_c,
            'loss_f': avg_loss_f,
            'vq': avg_loss_vq,
            'vqc': avg_loss_vqc,
            'entropy': avg_entropy,
        }, step - 1)
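# A minimal sanity check (not part of the original training path) showing
# that the coarse/fine byte split used above round-trips exactly. It assumes
# the module-level `torch` import used by the rest of this file.
@staticmethod
def _check_coarse_fine_roundtrip():
    wave16 = torch.tensor([-2**15, -1, 0, 2**15 - 1])
    coarse = (wave16 + 2**15) // 256  # high byte, in [0, 255]
    fine = (wave16 + 2**15) % 256     # low byte, in [0, 255]
    # Recombining the two bytes recovers the original signed samples.
    assert torch.equal(coarse * 256 + fine - 2**15, wave16)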
def do_train(self, paths, dataset, optimiser, epochs, batch_size, step,
             lr=1e-4, valid_index=[], use_half=False, do_clip=False):
    if use_half:
        import apex
        optimiser = apex.fp16_utils.FP16_Optimizer(
            optimiser, dynamic_loss_scale=True)
    for p in optimiser.param_groups:
        p['lr'] = lr
    criterion = nn.NLLLoss().cuda()
    k = 0
    saved_k = 0
    pad_left = self.pad_left()
    pad_left_encoder = self.pad_left_encoder()
    pad_left_decoder = self.pad_left_decoder()
    if self.noise_x:
        extra_pad_right = 127
    else:
        extra_pad_right = 0
    pad_right = self.pad_right() + extra_pad_right
    window = 16 * self.total_scale()
    logger.log(
        f'pad_left={pad_left_encoder}|{pad_left_decoder}, '
        f'pad_right={pad_right}, total_scale={self.total_scale()}')
    for e in range(epochs):
        trn_loader = DataLoader(
            dataset,
            collate_fn=lambda batch: env.collate_multispeaker_samples(
                pad_left, window, pad_right, batch),
            batch_size=batch_size,
            num_workers=2,
            shuffle=True,
            pin_memory=True)
        start = time.time()
        running_loss_c = 0.
        running_loss_f = 0.
        running_loss_vq = 0.
        running_loss_vqc = 0.
        running_entropy = 0.
        running_max_grad = 0.
        running_max_grad_name = ""
        iters = len(trn_loader)
        for i, (speaker, wave16) in enumerate(trn_loader):
            speaker = speaker.cuda()
            wave16 = wave16.cuda()
            # Split each signed 16-bit sample into coarse/fine bytes
            # and rescale to [-1, 1].
            coarse = (wave16 + 2**15) // 256
            fine = (wave16 + 2**15) % 256
            coarse_f = coarse.float() / 127.5 - 1.
            fine_f = fine.float() / 127.5 - 1.
            total_f = (wave16.float() + 0.5) / 32767.5
            if self.noise_y:
                # Random per-utterance gain plus small additive noise.
                noisy_f = total_f * (
                    0.02 * torch.randn(total_f.size(0), 1).cuda()
                ).exp() + 0.003 * torch.randn_like(total_f)
            else:
                noisy_f = total_f
            if use_half:
                coarse_f = coarse_f.half()
                fine_f = fine_f.half()
                noisy_f = noisy_f.half()
            x = torch.cat([
                coarse_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                fine_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                coarse_f[:, pad_left - pad_left_decoder + 1:1 - pad_right].unsqueeze(-1),
            ], dim=2)
            y_coarse = coarse[:, pad_left + 1:1 - pad_right]
            y_fine = fine[:, pad_left + 1:1 - pad_right]
            if self.noise_x:
                # Randomly translate the input to the encoder to encourage
                # translational invariance.
                total_len = coarse_f.size(1)
                translated = []
                for j in range(coarse_f.size(0)):
                    shift = random.randrange(256) - 128
                    translated.append(
                        noisy_f[j, pad_left - pad_left_encoder + shift:
                                total_len - extra_pad_right + shift])
                translated = torch.stack(translated, dim=0)
            else:
                translated = noisy_f[:, pad_left - pad_left_encoder:]
            p_cf, vq_pen, encoder_pen, entropy = self(speaker, x, translated)
            p_c, p_f = p_cf
            loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
            loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
            # Warm up the encoder penalty weight over the first 2k steps.
            encoder_weight = 0.01 * min(1, max(0.1, step / 1000 - 1))
            loss = loss_c + loss_f + vq_pen + encoder_weight * encoder_pen
            optimiser.zero_grad()
            if use_half:
                optimiser.backward(loss)
                if do_clip:
                    raise RuntimeError(
                        "clipping in half precision is not implemented yet")
            else:
                loss.backward()
                if do_clip:
                    max_grad = 0
                    max_grad_name = ""
                    for name, param in self.named_parameters():
                        if param.grad is not None:
                            param_max_grad = param.grad.data.abs().max()
                            if param_max_grad > max_grad:
                                max_grad = param_max_grad
                                max_grad_name = name
                            if 1000000 < param_max_grad:
                                logger.log(
                                    f'Very large gradient at {name}: '
                                    f'{param_max_grad}')
                    # Rescale all gradients so the largest has magnitude
                    # 100; zero them entirely if they are absurdly large.
                    if 100 < max_grad:
                        for param in self.parameters():
                            if param.grad is not None:
                                if 1000000 < max_grad:
                                    param.grad.data.zero_()
                                else:
                                    param.grad.data.mul_(100 / max_grad)
                    if running_max_grad < max_grad:
                        running_max_grad = max_grad
                        running_max_grad_name = max_grad_name
                    if 100000 < max_grad:
                        torch.save(self.state_dict(), "bad_model.pyt")
                        raise RuntimeError(
                            "Aborting due to crazy gradient "
                            "(model saved to bad_model.pyt)")
            optimiser.step()
            running_loss_c += loss_c.item()
            running_loss_f += loss_f.item()
            running_loss_vq += vq_pen.item()
            running_loss_vqc += encoder_pen.item()
            running_entropy += entropy
            self.after_update()
            speed = (i + 1) / (time.time() - start)
            avg_loss_c = running_loss_c / (i + 1)
            avg_loss_f = running_loss_f / (i + 1)
            avg_loss_vq = running_loss_vq / (i + 1)
            avg_loss_vqc = running_loss_vqc / (i + 1)
            avg_entropy = running_entropy / (i + 1)
            step += 1
            k = step // 1000
            logger.status(
                f'Epoch: {e+1}/{epochs} -- Batch: {i+1}/{iters} -- '
                f'Loss: c={avg_loss_c:#.4} f={avg_loss_f:#.4} '
                f'vq={avg_loss_vq:#.4} vqc={avg_loss_vqc:#.4} -- '
                f'Entropy: {avg_entropy:#.4} -- '
                f'Grad: {running_max_grad:#.1} {running_max_grad_name} '
                f'Speed: {speed:#.4} steps/sec -- Step: {k}k ')
        os.makedirs(paths.checkpoint_dir, exist_ok=True)
        torch.save(self.state_dict(), paths.model_path())
        np.save(paths.step_path(), step)
        logger.log_current_status()
        logger.log(
            f' <saved>; w[0][0] = '
            f'{self.overtone.wavernn.gru.weight_ih_l0[0][0]}')
        # Keep a historical snapshot and generate samples every 50k steps.
        if k > saved_k + 50:
            torch.save(self.state_dict(), paths.model_hist_path(step))
            saved_k = k
            self.do_generate(paths, step, dataset.path, valid_index)
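# The encoder penalty warm-up used in do_train above, factored out for
# illustration (a sketch, not part of the original file): the weight is held
# at 0.01 * 0.1 = 0.001 until step 1100, ramps linearly, and saturates at
# 0.01 from step 2000 onward.
@staticmethod
def _encoder_pen_weight(step):
    # Mirrors the inline expression in both do_train variants.
    return 0.01 * min(1, max(0.1, step / 1000 - 1))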
def do_train(self, paths, dataset, optimiser, writer, epochs, test_epochs,
             batch_size, step, epoch, valid_index=[], use_half=False,
             do_clip=False, beta=0.):
    if use_half:
        import apex
        optimiser = apex.fp16_utils.FP16_Optimizer(
            optimiser, dynamic_loss_scale=True)
    criterion = nn.NLLLoss().cuda()
    pad_left = self.pad_left()
    pad_left_encoder = self.pad_left_encoder()
    pad_left_decoder = self.pad_left_decoder()
    if self.noise_x:
        extra_pad_right = 127
    else:
        extra_pad_right = 0
    pad_right = self.pad_right() + extra_pad_right
    window = 16 * self.total_scale()
    for e in tqdm(range(epoch, epochs), desc="epochs"):
        trn_loader = DataLoader(
            dataset,
            collate_fn=lambda batch: env.collate_multispeaker_samples(
                pad_left, window, pad_right, batch),
            batch_size=batch_size,
            num_workers=2,
            shuffle=True,
            pin_memory=True)
        start = time.time()
        running_loss_c = 0.
        running_loss_f = 0.
        running_loss_vq = 0.
        running_loss_vqc = 0.
        running_entropy = 0.
        running_max_grad = 0.
        running_max_grad_name = ""
        iters = len(trn_loader)
        for i, (speaker, wave16) in enumerate(trn_loader):
            # env.MultispeakerDataset yields (speaker_onehot, audio) pairs.
            speaker = speaker.cuda()
            wave16 = wave16.cuda()
            # Split each signed 16-bit sample into coarse/fine bytes
            # and rescale to [-1, 1].
            coarse = (wave16 + 2**15) // 256
            fine = (wave16 + 2**15) % 256
            coarse_f = coarse.float() / 127.5 - 1.
            fine_f = fine.float() / 127.5 - 1.
            total_f = (wave16.float() + 0.5) / 32767.5
            if self.noise_y:
                # Random per-utterance gain plus small additive noise.
                noisy_f = total_f * (
                    0.02 * torch.randn(total_f.size(0), 1).cuda()
                ).exp() + 0.003 * torch.randn_like(total_f)
            else:
                noisy_f = total_f
            if use_half:
                coarse_f = coarse_f.half()
                fine_f = fine_f.half()
                noisy_f = noisy_f.half()
            x = torch.cat([
                coarse_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                fine_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                coarse_f[:, pad_left - pad_left_decoder + 1:1 - pad_right].unsqueeze(-1),
            ], dim=2)
            y_coarse = coarse[:, pad_left + 1:1 - pad_right]
            y_fine = fine[:, pad_left + 1:1 - pad_right]
            if self.noise_x:
                # Randomly translate the input to the encoder to encourage
                # translational invariance.
                total_len = coarse_f.size(1)
                translated = []
                for j in range(coarse_f.size(0)):
                    shift = random.randrange(256) - 128
                    translated.append(
                        noisy_f[j, pad_left - pad_left_encoder + shift:
                                total_len - extra_pad_right + shift])
                translated = torch.stack(translated, dim=0)
            else:
                translated = noisy_f[:, pad_left - pad_left_encoder:]
            # forward calculation:
            # self.forward(global_decoder_cond, x, samples)
            p_cf, vq_pen, encoder_pen, entropy = self(speaker, x, translated)
            # loss calculation
            p_c, p_f = p_cf
            loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
            loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
            # Warm up the encoder penalty weight over the first 2k steps.
            encoder_weight = 0.01 * min(1, max(0.1, step / 1000 - 1))
            loss = loss_c + loss_f + vq_pen + encoder_weight * encoder_pen
            # back propagation
            optimiser.zero_grad()
            if use_half:
                optimiser.backward(loss)
                if do_clip:
                    raise RuntimeError(
                        "clipping in half precision is not implemented yet")
            else:
                loss.backward()
                if do_clip:
                    max_grad = 0
                    max_grad_name = ""
                    for name, param in self.named_parameters():
                        if param.grad is not None:
                            param_max_grad = param.grad.data.abs().max()
                            if param_max_grad > max_grad:
                                max_grad = param_max_grad
                                max_grad_name = name
                    # Rescale all gradients so the largest has magnitude
                    # 100; zero them entirely if they are absurdly large.
                    if 100 < max_grad:
                        for param in self.parameters():
                            if param.grad is not None:
                                if 1000000 < max_grad:
                                    param.grad.data.zero_()
                                else:
                                    param.grad.data.mul_(100 / max_grad)
                    if running_max_grad < max_grad:
                        running_max_grad = max_grad
                        running_max_grad_name = max_grad_name
                    if 100000 < max_grad:
                        torch.save(self.state_dict(), "bad_model.pyt")
                        raise RuntimeError(
                            "Aborting due to crazy gradient "
                            "(model saved to bad_model.pyt)")
            # optimization
            optimiser.step()
            # loss logging
            running_loss_c += loss_c.item()
            running_loss_f += loss_f.item()
            running_loss_vq += vq_pen.item()
            running_loss_vqc += encoder_pen.item()
            running_entropy += entropy
            self.after_update()
            speed = (i + 1) / (time.time() - start)
            avg_loss_c = running_loss_c / (i + 1)
            avg_loss_f = running_loss_f / (i + 1)
            avg_loss_vq = running_loss_vq / (i + 1)
            avg_loss_vqc = running_loss_vqc / (i + 1)
            avg_entropy = running_entropy / (i + 1)
            step += 1
            k = step // 1000
            # tensorboard writer
            writer.add_scalars(
                'Train/loss_group', {
                    'loss_c': loss_c.item(),
                    'loss_f': loss_f.item(),
                    'vq': vq_pen.item(),
                    'vqc': encoder_pen.item(),
                    'entropy': entropy,
                }, step - 1)
        # Save a full training checkpoint (not just the model weights)
        # so training can be resumed from this epoch.
        os.makedirs(paths.checkpoint_dir, exist_ok=True)
        torch.save(
            {
                'epoch': e,
                'state_dict': self.state_dict(),
                'optimiser': optimiser.state_dict(),
                'step': step
            }, paths.model_path())
        if e % test_epochs == 0:
            torch.save(
                {
                    'epoch': e,
                    'state_dict': self.state_dict(),
                    'optimiser': optimiser.state_dict(),
                    'step': step
                }, paths.model_hist_path(step))
            self.do_test(writer, e, step, dataset.path, valid_index)
            self.do_test_generate(paths, step, dataset.path, valid_index)
        # end of epoch
    print("Finished training.")
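# The second do_train above saves a dict checkpoint (epoch, state_dict,
# optimiser, step) rather than a bare state_dict. A minimal resume sketch
# under that assumption; this module-level helper is illustrative and not
# part of the original file, and `model_path` is whatever
# paths.model_path() returned at save time.
def load_training_checkpoint(model, optimiser, model_path):
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])
    optimiser.load_state_dict(checkpoint['optimiser'])
    # Resume at the next epoch; `step` continues from where it left off.
    return checkpoint['step'], checkpoint['epoch'] + 1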