def generate_plots(self, model: ForwardTacotron, session: ForwardSession) -> None:
    """Log mel figures and audio for the session's validation sample.

    Two passes are written through ``self.writer``:

    * ``Ground_Truth_Aligned/*`` -- the forward pass ``model(x, m, dur, mel_lens)``
      fed with the sample's own mels and durations.
    * ``Generated/*`` -- ``model.generate`` on the first text sequence of the
      sample, trimmed to its length ``x_lens[0]``.

    The model is put into eval mode and is NOT switched back here; the caller
    is responsible for calling ``model.train()`` again if training continues.

    Args:
        model: Model to evaluate. ``model.step`` is used as the global step
            of every logged figure and audio clip.
        session: Session whose ``val_sample`` holds
            ``(x, m, ids, x_lens, mel_lens, dur)``.
    """
    # Function-scope import keeps this method self-contained.
    import torch

    model.eval()
    device = next(model.parameters()).device
    x, m, _, x_lens, mel_lens, dur = session.val_sample
    x, m, dur, mel_lens = x.to(device), m.to(device), dur.to(device), mel_lens.to(device)

    # The outputs are only plotted, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        m1_hat, m2_hat, _ = model(x, m, dur, mel_lens)

    # Keep the first batch item, capped at 600 entries along the next axis.
    m1_hat = np_now(m1_hat)[0, :600, :]
    m2_hat = np_now(m2_hat)[0, :600, :]
    m = np_now(m)[0, :600, :]

    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_fig = plot_mel(m)

    self.writer.add_figure('Ground_Truth_Aligned/target', m_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)
    target_wav = reconstruct_waveform(m)

    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                          snd_tensor=m2_hat_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)

    # Free-running synthesis of the first sequence without its padding.
    with torch.no_grad():
        m1_hat, m2_hat, _ = model.generate(x[0, :x_lens[0]].tolist())

    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)

    # The target figure and target audio are re-logged under Generated/ so
    # both TensorBoard groups can be compared against the same reference.
    self.writer.add_figure('Generated/target', m_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)

    self.writer.add_audio(tag='Generated/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav',
                          snd_tensor=m2_hat_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
    """Log attention/mel figures and audio for the session's validation sample.

    Two passes are written through ``self.writer``:

    * ``Ground_Truth_Aligned/*`` -- the forward pass ``model(x, m)`` fed with
      the sample's own mels.
    * ``Generated/*`` -- ``model.generate`` on the first text sequence, limited
      to ``m_lens[0] + 20`` steps.

    The model is put into eval mode and is NOT switched back here; the caller
    is responsible for calling ``model.train()`` again if training continues.

    Args:
        model: Model to evaluate. ``model.step`` is used as the global step
            of every logged figure and audio clip.
        session: Session whose ``val_sample`` holds
            ``(x, m, ids, x_lens, m_lens)``.
    """
    # Function-scope import keeps this method self-contained.
    import torch

    model.eval()
    device = next(model.parameters()).device
    x, m, _, _, m_lens = session.val_sample
    x, m = x.to(device), m.to(device)

    # The outputs are only plotted, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        m1_hat, m2_hat, att = model(x, m)

    # Keep the first batch item; mels are capped at 600 entries along the
    # next axis.
    att = np_now(att)[0]
    m1_hat = np_now(m1_hat)[0, :600, :]
    m2_hat = np_now(m2_hat)[0, :600, :]
    m = np_now(m)[0, :600, :]

    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_fig = plot_mel(m)

    self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/target', m_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)
    target_wav = reconstruct_waveform(m)

    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                          snd_tensor=m2_hat_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)

    # Free-running synthesis; the step budget is the target length plus 20.
    with torch.no_grad():
        m1_hat, m2_hat, att = model.generate(x[0].tolist(), steps=m_lens[0] + 20)

    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)

    # The target figure and target audio are re-logged under Generated/ so
    # both TensorBoard groups can be compared against the same reference.
    self.writer.add_figure('Generated/attention', att_fig, model.step)
    self.writer.add_figure('Generated/target', m_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)

    self.writer.add_audio(tag='Generated/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav',
                          snd_tensor=m2_hat_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
def generate_plots(self, model: ForwardTacotron, session: TTSSession) -> None:
    """Log mel figures and audio for the session's validation sample.

    Two passes are written through ``self.writer``:

    * ``Ground_Truth_Aligned/*`` -- the forward pass ``model(x, m, dur)`` fed
      with the sample's own mels and durations. Figures are drawn from the
      raw outputs; ``rescale_mel`` is applied only before waveform
      reconstruction.
    * ``Generated/*`` -- ``model.generate`` on the first text sequence. Here
      ``rescale_mel`` is applied before both plotting and reconstruction.

    The model is put into eval mode and is NOT switched back here; the caller
    is responsible for calling ``model.train()`` again if training continues.

    Args:
        model: Model to evaluate. ``model.step`` is used as the global step
            of every logged figure and audio clip.
        session: Session whose ``val_sample`` holds ``(x, m, ids, lens, dur)``.
    """
    # Function-scope import keeps this method self-contained.
    import torch

    model.eval()
    device = next(model.parameters()).device
    x, m, _, _, dur = session.val_sample
    x, m, dur = x.to(device), m.to(device), dur.to(device)

    # The outputs are only plotted, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        m1_hat, m2_hat, _ = model(x, m, dur)

    # Keep the first batch item, capped at 600 entries along the next axis.
    m1_hat = np_now(m1_hat)[0, :600, :]
    m2_hat = np_now(m2_hat)[0, :600, :]
    m = np_now(m)[0, :600, :]

    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_fig = plot_mel(m)

    self.writer.add_figure('Ground_Truth_Aligned/target', m_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    # Rescale before turning the mels back into audio.
    m1_hat, m2_hat, m = rescale_mel(m1_hat), rescale_mel(m2_hat), rescale_mel(m)
    m2_hat_wav = reconstruct_waveform(m2_hat)
    target_wav = reconstruct_waveform(m)

    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav',
                          snd_tensor=m2_hat_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)

    # Free-running synthesis of the first sequence in the batch.
    with torch.no_grad():
        m1_hat, m2_hat, _ = model.generate(x[0].tolist())
    m1_hat, m2_hat = rescale_mel(m1_hat), rescale_mel(m2_hat)

    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)

    # The target figure and target audio are re-logged under Generated/ so
    # both TensorBoard groups can be compared against the same reference.
    self.writer.add_figure('Generated/target', m_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)

    self.writer.add_audio(tag='Generated/target_wav',
                          snd_tensor=target_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav',
                          snd_tensor=m2_hat_wav,
                          global_step=model.step,
                          sample_rate=hp.sample_rate)
def create_align_features(model: Tacotron,
                          train_set: DataLoader,
                          val_set: DataLoader,
                          save_path: Path):
    """Count attention peaks per input position and save them per item.

    Every batch of ``train_set`` followed by ``val_set`` is run through the
    model without gradients. For each mel step the input position with the
    highest attention weight is taken; the number of steps landing on each
    position is written to ``save_path / '<item_id>.npy'`` as an int32 row.

    Args:
        model: Tacotron model; its reduction factor ``r`` must be 1.
        train_set: Loader yielding ``(x, mels, ids, mel_lens)`` batches.
        val_set: Loader with the same batch layout as ``train_set``.
        save_path: Directory that receives one ``.npy`` file per item id.

    Raises:
        AssertionError: If ``model.r`` is not 1.
    """
    assert model.r == 1, ('Reduction factor of tacotron must be 1 for creating alignment features! '
                          f'Reduction factor was: {model.r}')
    model.eval()
    # Inputs are moved to whichever device already holds the model weights.
    device = next(model.parameters()).device

    total_batches = len(val_set) + len(train_set)
    batches = itertools.chain(train_set, val_set)

    for batch_no, (x, mels, ids, mel_lens) in enumerate(batches, 1):
        x = x.to(device)
        mels = mels.to(device)

        with torch.no_grad():
            _, _, attention = model(x, mels)
        attention = np_now(attention)

        # Axis 0 is the batch, axis 2 the input positions; axis 1 is sliced
        # by mel_lens below, so it is presumably the mel steps.
        batch_size = attention.shape[0]
        n_chars = attention.shape[2]
        peaks = np.argmax(attention, axis=2)
        mel_counts = np.zeros(shape=(batch_size, n_chars), dtype=np.int32)

        for b in range(batch_size):
            row = peaks[b]  # view: edits below write through to `peaks`
            # Smooth out random jumps: a peak more than 10 positions away
            # from the previous (already smoothed) one is replaced by it.
            for t in range(1, peaks.shape[1]):
                if abs(row[t] - row[t - 1]) > 10:
                    row[t] = row[t - 1]
            # Only the first mel_lens[b] steps are counted.
            counts = np.bincount(row[:mel_lens[b]])
            mel_counts[b, :len(counts)] = counts

        for idx, item_id in enumerate(ids):
            np.save(str(save_path / f'{item_id}.npy'), mel_counts[idx], allow_pickle=False)

        stream(f'{progbar(batch_no, total_batches)} {batch_no}/{total_batches} Batches ')
def create_align_features(
        model: Tacotron,
        train_set: DataLoader,
        val_set: DataLoader,
        save_path_alg: Path,
):
    """Extract durations from attention and save them together with scores.

    Every batch of ``train_set`` (followed by ``val_set`` when it is not
    ``None``) is run through the model without gradients. Durations for the
    FIRST item of each batch are extracted and written to
    ``save_path_alg / '<item_id>.npy'``; its alignment and sharpness scores
    are collected and pickled to ``paths.data / 'att_score_dict.pkl'`` at
    the end.

    NOTE(review): only index 0 of each batch is processed, so the loaders
    are presumably expected to use a batch size of 1 -- confirm at the call
    site.

    Args:
        model: Tacotron model; its reduction factor ``r`` must be 1.
        train_set: Loader yielding ``(x, mels, ids, x_lens, mel_lens)``.
        val_set: Loader with the same batch layout, or ``None`` to process
            ``train_set`` only.
        save_path_alg: Directory that receives one ``.npy`` file per item.

    Raises:
        AssertionError: If ``model.r`` is not 1.
    """
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    # Inputs are moved to whichever device already holds the model weights.
    device = next(model.parameters()).device

    loaders = [train_set] if val_set is None else [train_set, val_set]
    iters = sum(len(loader) for loader in loaders)
    dataset = itertools.chain.from_iterable(loaders)

    att_score_dict = {}

    # The extraction strategy is chosen once, from the hyperparameters.
    if hp.extract_durations_with_dijkstra:
        print('Extracting durations using dijkstra...')
        dur_extraction_func = extract_durations_with_dijkstra
    else:
        print('Extracting durations using attention peak counts...')
        dur_extraction_func = extract_durations_per_count

    for i, (x, mels, ids, _, mel_lens) in enumerate(dataset, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, _, att_batch = model(x, mels)
        align_score, sharp_score = attention_score(att_batch, mel_lens, r=1)
        att_batch = np_now(att_batch)

        # Only the first item of the batch is used from here on.
        seq, att, mel_len, item_id = x[0], att_batch[0], mel_lens[0], ids[0]
        att_score_dict[item_id] = (float(align_score[0]), float(sharp_score[0]))

        durs = dur_extraction_func(seq, att, mel_len)
        # Sanity check: durations should add up to the mel length.
        if np.sum(durs) != mel_len:
            print(
                f'WARNING: Sum of durations did not match mel length for item {item_id}!'
            )
        np.save(str(save_path_alg / f'{item_id}.npy'), durs, allow_pickle=False)

        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)

    pickle_binary(att_score_dict, paths.data / 'att_score_dict.pkl')