def test_train_step(self):
    input = torch.randint(0, 24, (8, 128)).long().to(device)
    input_lengths = torch.randint(100, 129, (8, )).long().to(device)
    input_lengths[-1] = 128
    mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
    linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
    mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
    stop_targets = torch.zeros(8, 30, 1).float().to(device)
    speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

    for idx in mel_lengths:
        stop_targets[:, int(idx.item()):, 0] = 1.0

    stop_targets = stop_targets.view(input.shape[0], stop_targets.size(1) // c.r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

    criterion = L1LossMasked().to(device)
    criterion_st = nn.BCEWithLogitsLoss().to(device)
    model = Tacotron(num_chars=32,
                     num_speakers=5,
                     linear_dim=c.audio['num_freq'],
                     mel_dim=c.audio['num_mels'],
                     r=c.r,
                     memory_size=c.memory_size).to(device)
    model.train()
    print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
    model_ref = copy.deepcopy(model)
    count = 0
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        assert (param - param_ref).sum() == 0, param
        count += 1
    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    for _ in range(5):
        mel_out, linear_out, align, stop_tokens = model.forward(
            input, input_lengths, mel_spec, speaker_ids)
        optimizer.zero_grad()
        loss = criterion(mel_out, mel_spec, mel_lengths)
        stop_loss = criterion_st(stop_tokens, stop_targets)
        loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
        loss.backward()
        optimizer.step()
    # check parameter changes
    count = 0
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        # ignore pre-highway layer since it works conditionally
        # if count not in [145, 59]:
        assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
            count, param.shape, param, param_ref)
        count += 1
def test_train_step(self):
    input = torch.randint(0, 24, (8, 128)).long().to(device)
    mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
    linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
    mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
    stop_targets = torch.zeros(8, 30, 1).float().to(device)

    for idx in mel_lengths:
        stop_targets[:, int(idx.item()):, 0] = 1.0

    stop_targets = stop_targets.view(input.shape[0], stop_targets.size(1) // c.r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

    criterion = L1LossMasked().to(device)
    criterion_st = nn.BCELoss().to(device)
    model = Tacotron(c.embedding_size, c.audio['num_freq'], c.audio['num_mels'], c.r).to(device)
    model.train()
    model_ref = copy.deepcopy(model)
    count = 0
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        assert (param - param_ref).sum() == 0, param
        count += 1
    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    for i in range(5):
        mel_out, linear_out, align, stop_tokens = model.forward(input, mel_spec)
        assert stop_tokens.data.max() <= 1.0
        assert stop_tokens.data.min() >= 0.0
        optimizer.zero_grad()
        loss = criterion(mel_out, mel_spec, mel_lengths)
        stop_loss = criterion_st(stop_tokens, stop_targets)
        loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
        loss.backward()
        optimizer.step()
    # check parameter changes
    count = 0
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        # ignore pre-highway layer since it works conditionally
        # if count not in [145, 59]:
        assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
            count, param.shape, param, param_ref)
        count += 1
def create_align_features(model: Tacotron, train_set: DataLoader, val_set: DataLoader, save_path: Path):
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(val_set) + len(train_set)
    dataset = itertools.chain(train_set, val_set)
    for i, (x, mels, ids, mel_lens) in enumerate(dataset, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, _, attn = model(x, mels)
        attn = np_now(attn)
        bs, chars = attn.shape[0], attn.shape[2]
        argmax = np.argmax(attn[:, :, :], axis=2)
        mel_counts = np.zeros(shape=(bs, chars), dtype=np.int32)
        for b in range(attn.shape[0]):
            # fix random jumps in attention
            for j in range(1, argmax.shape[1]):
                if abs(argmax[b, j] - argmax[b, j - 1]) > 10:
                    argmax[b, j] = argmax[b, j - 1]
            count = np.bincount(argmax[b, :mel_lens[b]])
            mel_counts[b, :len(count)] = count[:len(count)]
        for j, item_id in enumerate(ids):
            np.save(str(save_path / f'{item_id}.npy'), mel_counts[j, :], allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
def create_attn_ref(model: Tacotron, train_set, save_path: Path):
    # import pdb; pdb.set_trace()
    save_path.mkdir(parents=False, exist_ok=True)
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(train_set)
    for i, (x, mels, ids, mel_lens) in enumerate(train_set, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, m2_hat, attn_ref = model(x, mels)
            # print(x.size())
            # print(mels.size())
            # print(m2_hat.size())
            # print(attn_ref.size())
            # print(mel_lens)
            # pdb.set_trace()
        attn_ref = attn_ref.cpu().numpy()
        for j, item_id in enumerate(ids):
            # attn_ref_tmp = attn_ref[j][:mel_lens[j]//model.r, :]
            attn_ref_tmp = attn_ref[j][:, :]
            np.save(save_path / f'{item_id}.npy', attn_ref_tmp, allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
def create_gta_features(model: Tacotron, train_set, save_path: Path):
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(train_set)
    for i, (x, mels, ids, mel_lens) in enumerate(train_set, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, gta, _ = model(x, mels)
        gta = gta.cpu().numpy()
        for j, item_id in enumerate(ids):
            mel = gta[j][:, :mel_lens[j]]
            mel = (mel + 4) / 8
            np.save(save_path / '%s.npy' % (repr1(item_id)), mel, allow_pickle=False)
        bar = progbar(i, iters)
        msg = '%s %s/%s Batches ' % (repr1(bar), repr1(i), repr1(iters))
        stream(msg)
def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
    model.eval()
    device = next(model.parameters()).device
    x, m, ids, x_lens, m_lens = session.val_sample
    x, m = x.to(device), m.to(device)

    m1_hat, m2_hat, att = model(x, m)
    att = np_now(att)[0]
    m1_hat = np_now(m1_hat)[0, :600, :]
    m2_hat = np_now(m2_hat)[0, :600, :]
    m = np_now(m)[0, :600, :]

    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_fig = plot_mel(m)

    self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/target', m_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)
    target_wav = reconstruct_waveform(m)

    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)

    m1_hat, m2_hat, att = model.generate(x[0].tolist(), steps=m_lens[0] + 20)
    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)

    self.writer.add_figure('Generated/attention', att_fig, model.step)
    self.writer.add_figure('Generated/target', m_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)
    self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)
def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
    model.eval()
    device = next(model.parameters()).device
    batch = session.val_sample
    batch = to_device(batch, device=device)

    m1_hat, m2_hat, att = model(batch['x'], batch['mel'])
    att = np_now(att)[0]
    m1_hat = np_now(m1_hat)[0, :600, :]
    m2_hat = np_now(m2_hat)[0, :600, :]
    m_target = np_now(batch['mel'])[0, :600, :]

    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_target_fig = plot_mel(m_target)

    self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = self.dsp.griffinlim(m2_hat)
    target_wav = self.dsp.griffinlim(m_target)

    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)

    m1_hat, m2_hat, att = model.generate(batch['x'][0:1], steps=batch['mel_len'][0] + 20)
    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)

    self.writer.add_figure('Generated/attention', att_fig, model.step)
    self.writer.add_figure('Generated/target', m_target_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    m2_hat_wav = self.dsp.griffinlim(m2_hat)
    self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
def create_align_features(model: Tacotron,
                          train_set: DataLoader,
                          val_set: DataLoader,
                          save_path_alg: Path,
                          # save_path_pitch: Path
                          ):
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    device = next(model.parameters()).device  # use same device as model parameters
    if val_set is not None:
        iters = len(val_set) + len(train_set)
        dataset = itertools.chain(train_set, val_set)
    else:
        # print('here')
        iters = len(train_set)
        # print(iters)
        dataset = itertools.chain(train_set)
    att_score_dict = {}

    if hp.extract_durations_with_dijkstra:
        print('Extracting durations using dijkstra...')
        dur_extraction_func = extract_durations_with_dijkstra
    else:
        print('Extracting durations using attention peak counts...')
        dur_extraction_func = extract_durations_per_count

    # for i in dataset:
    #     print(i)
    for i, (x, mels, ids, x_lens, mel_lens) in enumerate(dataset, 1):
        x, mels = x.to(device), mels.to(device)
        # print(x)
        # print(mels)
        with torch.no_grad():
            _, _, att_batch = model(x, mels)
        align_score, sharp_score = attention_score(att_batch, mel_lens, r=1)
        att_batch = np_now(att_batch)
        seq, att, mel_len, item_id = x[0], att_batch[0], mel_lens[0], ids[0]
        align_score, sharp_score = float(align_score[0]), float(sharp_score[0])
        att_score_dict[item_id] = (align_score, sharp_score)
        durs = dur_extraction_func(seq, att, mel_len)
        if np.sum(durs) != mel_len:
            print(f'WARNING: Sum of durations did not match mel length for item {item_id}!')
        np.save(str(save_path_alg / f'{item_id}.npy'), durs, allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
    pickle_binary(att_score_dict, paths.data / 'att_score_dict.pkl')
def evaluate(self, model: Tacotron, val_set: Dataset) -> float:
    model.eval()
    val_loss = 0
    device = next(model.parameters()).device
    for i, (x, m, ids, _) in enumerate(val_set, 1):
        x, m = x.to(device), m.to(device)
        with torch.no_grad():
            m1_hat, m2_hat, attention = model(x, m)
            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            val_loss += m1_loss.item() + m2_loss.item()
    return val_loss / len(val_set)
def evaluate(self, model: Tacotron, val_set: Dataset) -> Tuple[float, float]:
    model.eval()
    val_loss = 0
    val_att_score = 0
    device = next(model.parameters()).device
    for i, batch in enumerate(val_set, 1):
        batch = to_device(batch, device=device)
        with torch.no_grad():
            m1_hat, m2_hat, attention = model(batch['x'], batch['mel'])
            m1_loss = F.l1_loss(m1_hat, batch['mel'])
            m2_loss = F.l1_loss(m2_hat, batch['mel'])
            val_loss += m1_loss.item() + m2_loss.item()
        _, att_score = attention_score(attention, batch['mel_len'])
        val_att_score += torch.mean(att_score).item()
    return val_loss / len(val_set), val_att_score / len(val_set)
def evaluate(self, model: Tacotron, val_set: Dataset) -> Tuple[float, float]:
    model.eval()
    val_loss = 0
    val_att_score = 0
    device = next(model.parameters()).device
    for i, (x, m, ids, x_lens, mel_lens) in enumerate(val_set, 1):
        x, m = x.to(device), m.to(device)
        with torch.no_grad():
            m1_hat, m2_hat, attention = model(x, m)
            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            val_loss += m1_loss.item() + m2_loss.item()
        _, att_score = attention_score(attention, mel_lens)
        val_att_score += torch.mean(att_score).item()
    return val_loss / len(val_set), val_att_score / len(val_set)
def create_gta_features(model: Tacotron, train_set: DataLoader, val_set: DataLoader, save_path: Path):
    model.eval()
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(train_set) + len(val_set)
    dataset = itertools.chain(train_set, val_set)
    for i, (x, mels, ids, mel_lens, dur) in enumerate(dataset, 1):
        x, mels, dur = x.to(device), mels.to(device), dur.to(device)
        with torch.no_grad():
            _, gta, _ = model(x, mels, dur)
        gta = gta.cpu().numpy()
        for j, item_id in enumerate(ids):
            mel = gta[j][:, :mel_lens[j]]
            np.save(str(save_path / f'{item_id}.npy'), mel, allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
def create_gta_features(model: Tacotron, train_set: DataLoader, val_set: DataLoader, save_path: Path) -> None:
    model.eval()
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(train_set) + len(val_set)
    dataset = itertools.chain(train_set, val_set)
    for i, batch in enumerate(dataset, 1):
        batch = to_device(batch, device=device)
        with torch.no_grad():
            pred = model(batch)
        gta = pred['mel_post'].cpu().numpy()
        for j, item_id in enumerate(batch['item_id']):
            mel = gta[j][:, :batch['mel_len'][j]]
            np.save(str(save_path / f'{item_id}.npy'), mel, allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
def create_align_features(model: Tacotron, train_set, save_path: Path):
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(train_set)
    for i, (x, mels, ids, mel_lens, _) in enumerate(train_set, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, _, attn = model(x, mels)
        attn = np_now(attn)
        bs, chars = attn.shape[0], attn.shape[2]
        argmax = np.argmax(attn[:, :, :], axis=2)
        mel_counts = np.zeros(shape=(bs, chars), dtype=np.int32)
        for b in range(attn.shape[0]):
            count = np.bincount(argmax[b, :])
            mel_counts[b, :len(count)] = count[:len(count)]
        for j, item_id in enumerate(ids):
            np.save(save_path / f'{item_id}.npy', mel_counts[j, :], allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
def create_gta_features(model: Tacotron, train_set, save_path: Path):
    save_path.mkdir(parents=False, exist_ok=True)
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(train_set)
    for i, (x, mels, ids, mel_lens) in enumerate(train_set, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, gta, _ = model(x, mels)
        gta = gta.cpu().numpy()
        for j, item_id in enumerate(ids):
            mel = gta[j][:, :mel_lens[j]]
            mel = (mel + 4) / 8
            np.save(save_path / f'{item_id}.npy', mel, allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
def main(args):
    dataset = importlib.import_module('datasets.' + c.dataset)
    Dataset = getattr(dataset, 'MyDataset')
    audio = importlib.import_module('utils.' + c.audio_processor)
    AudioProcessor = getattr(audio, 'AudioProcessor')

    ap = AudioProcessor(sample_rate=c.sample_rate, num_mels=c.num_mels,
                        min_level_db=c.min_level_db, frame_shift_ms=c.frame_shift_ms,
                        frame_length_ms=c.frame_length_ms, ref_level_db=c.ref_level_db,
                        num_freq=c.num_freq, power=c.power, preemphasis=c.preemphasis)

    # Setup the dataset
    train_dataset = Dataset(c.data_path, c.meta_file_train, c.r, c.text_cleaner,
                            ap=ap, min_seq_len=c.min_seq_len)
    train_loader = DataLoader(train_dataset, batch_size=c.batch_size, shuffle=False,
                              collate_fn=train_dataset.collate_fn, drop_last=False,
                              num_workers=c.num_loader_workers, pin_memory=True)

    if c.run_eval:
        val_dataset = Dataset(c.data_path, c.meta_file_val, c.r, c.text_cleaner, ap=ap)
        val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size, shuffle=False,
                                collate_fn=val_dataset.collate_fn, drop_last=False,
                                num_workers=4, pin_memory=True)
    else:
        val_loader = None

    model = Tacotron(c.embedding_size, ap.num_freq, c.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        optimizer.load_state_dict(checkpoint['optimizer'])
        # optimizer_st.load_state_dict(checkpoint['optimizer_st'])
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    scheduler = AnnealLR(optimizer, warmup_steps=c.warmup_steps)
    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st, train_loader,
                                         optimizer, optimizer_st, scheduler, ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, val_loader, ap, current_step)
        print(" | > Train Loss: {:.5f} Validation Loss: {:.5f}".format(train_loss, val_loss),
              flush=True)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
def main(args):
    # Setup the dataset
    train_dataset = LJSpeechDataset(
        os.path.join(c.data_path, 'train_metadata.csv'), os.path.join(c.data_path, 'wavs'),
        c.r, c.sample_rate, c.text_cleaner, c.num_mels, c.min_level_db, c.frame_shift_ms,
        c.frame_length_ms, c.preemphasis, c.ref_level_db, c.num_freq, c.power,
        min_seq_len=c.min_seq_len)

    train_loader = DataLoader(train_dataset, batch_size=c.batch_size, shuffle=False,
                              collate_fn=train_dataset.collate_fn, drop_last=False,
                              num_workers=c.num_loader_workers, pin_memory=True)
    '''
    val_dataset = LJSpeechDataset(
        os.path.join(c.data_path, 'valid_metadata.csv'), os.path.join(c.data_path, 'wavs'),
        c.r, c.sample_rate, c.text_cleaner, c.num_mels, c.min_level_db, c.frame_shift_ms,
        c.frame_length_ms, c.preemphasis, c.ref_level_db, c.num_freq, c.power)

    val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size, shuffle=False,
                            collate_fn=val_dataset.collate_fn, drop_last=False,
                            num_workers=4, pin_memory=True)
    '''
    model = Tacotron(c.embedding_size, c.num_freq, c.num_mels, c.r)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        optimizer.load_state_dict(checkpoint['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'])
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        start_epoch = 0
        args.restore_step = checkpoint['step']
        optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)
    else:
        args.restore_step = 0
        print("\n > Starting a new training")

    if use_cuda:
        model = nn.DataParallel(model.cuda())
        criterion.cuda()
        criterion_st.cuda()

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st, train_loader,
                                         optimizer, optimizer_st, epoch)
        # val_loss = evaluate(model, criterion, criterion_st, val_loader, current_step)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument('--force_cpu', '-c', action='store_true',
                        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError('`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()
            r, lr, max_step, batch_size = session
            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # We have completed training. Breaking is same as continue
                        break
                else:
                    # There is a following session, go to it
                    continue

            model.r = r
            simple_table([('Steps with r=%s' % (repr1(r)), str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size, r)
            tts_train_loop(paths, model, optimizer, train_set, lr, training_steps, attn_example)

        print('Training Complete.')
        print('To continue training increase tts_total_steps in hparams.py or use --force_train\n')

    print('Creating Ground Truth Aligned Dataset...\n')
    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)
    print('\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n')
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):
            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = 'taco_step%sK' % (repr1(k))
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / '%s' % (repr1(step)))
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / '%s' % (repr1(step)), 600)

            msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.2f steps/s | Step: %sk | ' % (
                repr1(e), repr1(epochs), repr1(i), repr1(total_iters), avg_loss, speed, repr1(k))
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def train_session(self, model: Tacotron, optimizer: Optimizer, session: TTSSession) -> None:
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1
    model.r = session.r
    simple_table([(f'Steps with r={session.r}', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr),
                  ('Outputs/Step (r)', model.r)])
    for g in optimizer.param_groups:
        g['lr'] = session.lr

    loss_avg = Averager()
    duration_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters
    for e in range(1, epochs + 1):
        for i, (x, m, ids, x_lens, mel_lens) in enumerate(session.train_set, 1):
            start = time.time()
            model.train()
            x, m = x.to(device), m.to(device)

            m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
            optimizer.step()
            loss_avg.add(loss.item())

            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', self.paths, model, optimizer, name=ckpt_name, is_silent=True)

            if step % hp.tts_plot_every == 0:
                self.generate_plots(model, session)

            _, att_score = attention_score(attention, mel_lens)
            att_score = torch.mean(att_score)
            self.writer.add_scalar('Attention_Score/train', att_score, model.get_step())
            self.writer.add_scalar('Loss/train', loss, model.get_step())
            self.writer.add_scalar('Params/reduction_factor', session.r, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        val_loss, val_att_score = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        self.writer.add_scalar('Attention_Score/val', val_att_score, model.get_step())
        save_checkpoint('tts', self.paths, model, optimizer, is_silent=True)

        loss_avg.reset()
        duration_avg.reset()
        print(' ')
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, att_guides) in enumerate(train_set, 1):
            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            # print(att_guides.shape)
            orig_attention = attention
            n = int(len(att_guides[0]) / r)
            # print("n", n)

            # reduce guide by r factor
            ga = [a[t] for a in att_guides for t in range(0, len(a), r)]
            assert n == len(attention[0])
            guided_attention = [ga[k:k + n] for k in range(0, len(ga), n)]

            attention = np_now(attention)
            attention = [pad2d_nonzero(x, n, len(att_guides[0][0])) for x in attention]

            guided_attention = torch.tensor(guided_attention)
            guided_attention = guided_attention.to(device)

            attention = torch.tensor(attention)
            attention = attention.to(device)

            # create attention mask
            attention_masks = torch.ne(attention, -1).type(torch.FloatTensor)
            attention_masks = torch.tensor(attention_masks)
            attention_masks = attention_masks.to(device)

            multiply = torch.abs(attention * guided_attention) * attention_masks
            attention_loss = torch.sum(multiply)
            mask_sum = torch.sum(attention_masks)
            attention_loss /= mask_sum

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            # print("mask sum", mask_sum)
            # print("attention loss", attention_loss)
            # print("m losses", m1_loss, m2_loss)
            prev_loss = m1_loss + m2_loss
            # print("prev loss", prev_loss)
            loss = m1_loss + m2_loss + attention_loss
            # print("loss + att", loss)

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(orig_attention[idx][:, :160]), paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def main(args):
    model = Tacotron(c.embedding_size, ap.num_freq, ap.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
        except:
            model_dict = model.state_dict()
            # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
            # 1. filter out unnecessary keys
            pretrained_dict = {k: v for k, v in checkpoint['model'].items() if k in model_dict}
            # 2. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)
            # 3. load the new state dict
            model.load_state_dict(model_dict)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        optimizer.load_state_dict(checkpoint['optimizer'])
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        start_epoch = checkpoint['epoch']
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st, optimizer,
                                         optimizer_st, scheduler, ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, current_step)
        print(" | > Train Loss: {:.5f} Validation Loss: {:.5f}".format(train_loss, val_loss),
              flush=True)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
def main(args):
    # Setup the dataset
    train_dataset = LJSpeechDataset(
        os.path.join(c.data_path, 'metadata_train.csv'), os.path.join(c.data_path, 'wavs'),
        c.r, c.sample_rate, c.text_cleaner, c.num_mels, c.min_level_db, c.frame_shift_ms,
        c.frame_length_ms, c.preemphasis, c.ref_level_db, c.num_freq, c.power)

    train_loader = DataLoader(train_dataset, batch_size=c.batch_size, shuffle=False,
                              collate_fn=train_dataset.collate_fn, drop_last=False,
                              num_workers=c.num_loader_workers, pin_memory=True)

    val_dataset = LJSpeechDataset(
        os.path.join(c.data_path, 'metadata_val.csv'), os.path.join(c.data_path, 'wavs'),
        c.r, c.sample_rate, c.text_cleaner, c.num_mels, c.min_level_db, c.frame_shift_ms,
        c.frame_length_ms, c.preemphasis, c.ref_level_db, c.num_freq, c.power)

    val_loader = DataLoader(val_dataset, batch_size=c.batch_size, shuffle=False,
                            collate_fn=val_dataset.collate_fn, drop_last=False,
                            num_workers=4, pin_memory=True)

    model = Tacotron(c.embedding_size, c.hidden_size, c.num_mels, c.num_freq, c.r,
                     use_atten_mask=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % checkpoint['step'])
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        start_epoch = 0
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training")

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch)
        val_loss = evaluate(model, criterion, val_loader, current_step)
        best_loss = save_best_model(model, optimizer, val_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
print('\nInitialising Tacotron Model...\n')
model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(phonemes),
                 encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims,
                 n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims,
                 encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims,
                 postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways,
                 dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device)

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(f'Num Params: {params}')

optimizer = optim.Adam(model.parameters())
restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True, device=device)

if args.force_gta:
    print('Creating Ground Truth Aligned Dataset...\n')
    train_set, val_set = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, val_set, paths.gta)
    print(
def main(args):
    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    model = Tacotron(num_chars=num_chars,
                     embedding_dim=c.embedding_size,
                     linear_dim=ap.num_freq,
                     mel_dim=ap.num_mels,
                     r=c.r,
                     memory_size=c.memory_size)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        except:
            print(" > Partial model initialization.")
            partial_init_flag = True
            model_dict = model.state_dict()
            # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
            # 1. filter out unnecessary keys
            pretrained_dict = {k: v for k, v in checkpoint['model'].items() if k in model_dict}
            # 2. filter out different size layers
            pretrained_dict = {k: v for k, v in pretrained_dict.items()
                               if v.numel() == model_dict[k].numel()}
            # 3. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)
            # 4. load the new state dict
            model.load_state_dict(model_dict)
            print(" | > {} / {} layers are initialized".format(len(pretrained_dict), len(model_dict)))
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'], flush=True)
        start_epoch = checkpoint['epoch']
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st, optimizer,
                                         optimizer_st, scheduler, ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, current_step, epoch)
        print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(train_loss, val_loss),
              flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
def tts_train_loop_af_offline(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps,
                              attn_example, hp=None):
    # setattr(model, 'mode', 'attention_forcing')
    # import pdb

    def smooth(d, eps=float(1e-10)):
        u = 1.0 / float(d.size()[2])
        return eps * u + (1 - eps) * d

    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss_out, running_loss_attn = 0, 0

        # Perform 1 epoch
        for i, (x, m, ids, _, attn_ref) in enumerate(train_set, 1):
            # print(x.size())
            # print(m.size())
            # print(attn_ref.size())
            # # print(m1_hat.size(), m2_hat.size())
            # # print(attention.size(), attention.size(1)*model.r)
            # pdb.set_trace()
            x, m, attn_ref = x.to(device), m.to(device), attn_ref.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(model, x, m, False, attn_ref)
            else:
                m1_hat, m2_hat, attention = model(x, m, generate_gta=False, attn_ref=attn_ref)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            # attn_loss = F.kl_div(torch.log(smooth(attention)), smooth(attn_ref), reduction='mean')  # 'batchmean'
            attn_loss = F.l1_loss(smooth(attention), smooth(attn_ref))

            loss_out = m1_loss + m2_loss
            loss_attn = attn_loss * hp.attn_loss_coeff
            loss = loss_out + loss_attn

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss_out += loss_out.item()
            avg_loss_out = running_loss_out / i
            running_loss_attn += loss_attn.item()
            avg_loss_attn = running_loss_attn / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attn_ref[idx][:, :160]), paths.tts_attention / f'{step}_tf')
                save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / f'{step}_af')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss_out: {avg_loss_out:#.4}; ' \
                  f'Output_attn: {avg_loss_attn:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
                 postnet_dims=hp.tts_postnet_dims,
                 encoder_K=hp.tts_encoder_K,
                 lstm_dims=hp.tts_lstm_dims,
                 postnet_K=hp.tts_postnet_K,
                 num_highways=hp.tts_num_highways,
                 dropout=hp.tts_dropout).cuda()

paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

model.restore(paths.tts_latest_weights)

# model.reset_step()
# model.set_r(hp.tts_r)

optimiser = optim.Adam(model.parameters())

current_step = model.get_step()

if not force_gta:
    for session in hp.tts_schedule:
        r, lr, max_step, batch_size = session
        if current_step < max_step:
            train_set, attn_example = get_tts_dataset(paths.data, batch_size, r)
            model.set_r(r)
def main(args):
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(0)
    signal.signal(signal.SIGINT, signal_handler)

    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r, c.sample_rate, c.text_cleaner)

    model = Tacotron(c.embedding_size, c.hidden_size, c.num_mels, c.num_freq, c.r)
    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    try:
        checkpoint = torch.load(os.path.join(CHECKPOINT_PATH,
                                             'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
    except:
        print("\n > Starting a new training\n")

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    for epoch in range(c.epochs):
        dataloader = DataLoader(dataset, batch_size=c.batch_size, shuffle=True,
                                collate_fn=dataset.collate_fn, drop_last=True, num_workers=32)
        progbar = Progbar(len(dataset) / c.batch_size)
        for i, data in enumerate(dataloader):
            text_input = data[0]
            magnitude_input = data[1]
            mel_input = data[2]

            current_step = i + args.restore_step + epoch * len(dataloader) + 1

            optimizer.zero_grad()

            try:
                mel_input = np.concatenate(
                    (np.zeros([c.batch_size, 1, c.num_mels], dtype=np.float32),
                     mel_input[:, 1:, :]), axis=1)
            except:
                raise TypeError("not same dimension")

            if use_cuda:
                text_input_var = Variable(torch.from_numpy(text_input).type(torch.cuda.LongTensor),
                                          requires_grad=False).cuda()
                mel_input_var = Variable(torch.from_numpy(mel_input).type(torch.cuda.FloatTensor),
                                         requires_grad=False).cuda()
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(torch.cuda.FloatTensor),
                                        requires_grad=False).cuda()
                linear_spec_var = Variable(torch.from_numpy(magnitude_input).type(torch.cuda.FloatTensor),
                                           requires_grad=False).cuda()
            else:
                text_input_var = Variable(torch.from_numpy(text_input).type(torch.LongTensor),
                                          requires_grad=False)
                mel_input_var = Variable(torch.from_numpy(mel_input).type(torch.FloatTensor),
                                         requires_grad=False)
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(torch.FloatTensor),
                                        requires_grad=False)
                linear_spec_var = Variable(torch.from_numpy(magnitude_input).type(torch.FloatTensor),
                                           requires_grad=False)

            mel_output, linear_output, alignments = \
                model.forward(text_input_var, mel_input_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            linear_loss = torch.abs(linear_output - linear_spec_var)
            linear_loss = 0.5 * torch.mean(linear_loss) + \
                0.5 * torch.mean(linear_loss[:, :n_priority_freq, :])
            loss = mel_loss + linear_loss
            loss = loss.cuda()

            start_time = time.time()

            loss.backward()

            nn.utils.clip_grad_norm(model.parameters(), 1.)

            optimizer.step()

            time_per_step = time.time() - start_time
            progbar.update(i, values=[('total_loss', loss.data[0]),
                                      ('linear_loss', linear_loss.data[0]),
                                      ('mel_loss', mel_loss.data[0])])

            if current_step % c.save_step == 0:
                checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
                checkpoint_path = os.path.join(OUT_PATH, checkpoint_path)
                save_checkpoint({'model': model.state_dict(),
                                 'optimizer': optimizer.state_dict(),
                                 'step': current_step,
                                 'total_loss': loss.data[0],
                                 'linear_loss': linear_loss.data[0],
                                 'mel_loss': mel_loss.data[0],
                                 'date': datetime.date.today().strftime("%B %d, %Y")},
                                checkpoint_path)
                print(" > Checkpoint is saved : {}".format(checkpoint_path))

            if current_step in c.decay_step:
                optimizer = adjust_learning_rate(optimizer, current_step)
def main(args):
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name + '_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(
        os.path.join(c.data_path, 'metadata.csv'), os.path.join(c.data_path, 'wavs'),
        c.r, c.sample_rate, c.text_cleaner, c.num_mels, c.min_level_db, c.frame_shift_ms,
        c.frame_length_ms, c.preemphasis, c.ref_level_db, c.num_freq, c.power)

    dataloader = DataLoader(dataset, batch_size=c.batch_size, shuffle=True,
                            collate_fn=dataset.collate_fn, drop_last=True,
                            num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size, c.hidden_size, c.num_mels, c.num_freq, c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(args.restore_path,
                                             'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    # lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                                  patience=c.lr_patience, verbose=True)
    epoch_time = 0
    best_loss = float('inf')
    for epoch in range(0, c.epochs):
        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)
        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            # try:
            #     mel_input = np.concatenate((np.zeros(
            #         [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #         mel_input[:, 1:, :]), axis=1)
            # except:
            #     raise TypeError("not same dimension")

            # convert inputs to variables
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input, volatile=True)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            mel_output, linear_output, alignments = \
                model.forward(text_input_var, mel_spec_var,
                              input_lengths=torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths)))

            mel_loss = criterion(mel_output, mel_spec_var)
            # linear_loss = torch.abs(linear_output - linear_spec_var)
            # linear_loss = 0.5 * \
            #     torch.mean(linear_loss) + 0.5 * \
            #     torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                  linear_spec_var[:, :, :n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter + 1, values=[('total_loss', loss.data[0]),
                                                 ('linear_loss', linear_loss.data[0]),
                                                 ('mel_loss', mel_loss.data[0]),
                                                 ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0], current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'], current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:
                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()

                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except:
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())

        # average loss after the epoch
        avg_epoch_loss = np.mean(progbar.sum_values['linear_loss'][0] /
                                 max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
        # lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example,
                   max_y, max_x):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, padded_att_guides) in enumerate(train_set, 1):
            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            reduced_guides = []
            att_guide_path = hp.attention_path
            for j, item_id in enumerate(ids):
                att = np.load(f'{att_guide_path}/{item_id}.npy')
                reduced = att[0::r]
                pred_attention = attention[j]
                n_frames = pred_attention.shape[0]
                n_phones = pred_attention.shape[-1]
                # pred_attention = torch.tensor(pred_attention)
                # reduced = torch.tensor(reduced)
                padded_guides = pad2d_nonzero(reduced, n_frames, n_phones)
                # padded_guides = torch.tensor(padded_guides)
                reduced_guides.append(padded_guides)

            reduced_guides = torch.tensor(reduced_guides)
            mask = torch.ne(reduced_guides, -1).type(torch.FloatTensor)
            mask = torch.tensor(mask)
            padded_guides = [pad2d_zero(x, n_frames, n_phones) for x in reduced_guides]
            padded_guides = torch.tensor(padded_guides)
            padded_guides = padded_guides.to(device)

            attention = attention.to(device)
            mask = mask.to(device)
            attention = attention * mask

            print("guide att shape", att.shape)
            print(att)
            print("reduced guide", padded_guides.shape)
            # print("attention size", n_frames, n_phones)
            print("mask", mask.shape)
            print(mask)
            print(padded_guides.shape, attention.shape, mask.shape)
            print(attention)
            print(padded_guides)

            multiply = torch.pow((attention - padded_guides), 2)
            print(multiply)
            # multiply = torch.pow((pred_attention - padded_guides), 2) * mask
            # print(multiply)
            attention_loss = torch.sum(multiply)
            print(attention_loss)
            mask_sum1 = torch.sum(mask)
            attention_loss /= mask_sum1
            print(attention_loss)
            # batch_attention_losses.append(attention_loss)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            # average_att_loss = sum(batch_attention_losses) / len(batch_attention_losses)
            # print("attention loss", average_att_loss)
            # print("m losses", m1_loss, m2_loss)
            prev_loss = m1_loss + m2_loss
            print("prev loss", prev_loss)
            loss = m1_loss + m2_loss + attention_loss
            print("loss + att", loss)
            # exit()

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')