Ejemplo n.º 1
0
    def generate_plots(self, model: ForwardTacotron,
                       session: ForwardSession) -> None:
        """Log mel plots and audio for the session's validation sample.

        The model is run twice on ``session.val_sample``:

        * through its regular forward pass with the target mel, durations
          and mel lengths supplied; results are logged under
          ``Ground_Truth_Aligned/*``.
        * through ``model.generate`` on the first input sequence, cut to
          ``x_lens[0]`` entries; results are logged under ``Generated/*``.

        Only batch item 0 is visualised, and the forward-pass outputs and
        the target are cut to the first 600 entries along their second
        axis. Every entry is written at ``model.step``.

        Args:
            model: Model to evaluate. It is switched to eval mode and is
                not switched back by this method.
            session: Session whose ``val_sample`` provides the batch.
        """
        # Function-scope import so this method does not rely on a
        # module-level ``torch`` import being present.
        import torch

        def log_figures(section, named_figs):
            # Write each (name, figure) pair as '<section>/<name>'.
            for name, fig in named_figs:
                self.writer.add_figure(f'{section}/{name}', fig, model.step)

        def log_audio(section, named_wavs):
            # Write each (name, waveform) pair as '<section>/<name>'.
            for name, wav in named_wavs:
                self.writer.add_audio(tag=f'{section}/{name}',
                                      snd_tensor=wav,
                                      global_step=model.step,
                                      sample_rate=hp.sample_rate)

        model.eval()
        device = next(model.parameters()).device
        x, m, _, x_lens, mel_lens, dur = session.val_sample
        x, m, dur, mel_lens = (x.to(device), m.to(device), dur.to(device),
                               mel_lens.to(device))

        # Nothing is backpropagated here, so skip building the autograd graph.
        with torch.no_grad():
            m1_hat, m2_hat, _ = model(x, m, dur, mel_lens)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        # The target figure is reused for the 'Generated' section below.
        m_fig = plot_mel(m)
        log_figures('Ground_Truth_Aligned', (
            ('target', m_fig),
            ('linear', m1_hat_fig),
            ('postnet', m2_hat_fig),
        ))

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)
        log_audio('Ground_Truth_Aligned', (
            ('target_wav', target_wav),
            ('postnet_wav', m2_hat_wav),
        ))

        # Free-running generation on the first sequence without its padding.
        with torch.no_grad():
            m1_hat, m2_hat, _ = model.generate(x[0, :x_lens[0]].tolist())
        log_figures('Generated', (
            ('target', m_fig),
            ('linear', plot_mel(m1_hat)),
            ('postnet', plot_mel(m2_hat)),
        ))

        log_audio('Generated', (
            ('target_wav', target_wav),
            ('postnet_wav', reconstruct_waveform(m2_hat)),
        ))
Ejemplo n.º 2
0
    def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
        """Log attention/mel plots and audio for the validation sample.

        The model is run twice on ``session.val_sample``:

        * through its regular forward pass with the target mel supplied;
          results are logged under ``Ground_Truth_Aligned/*``.
        * through ``model.generate`` on the first input sequence, with
          ``steps`` set to the first mel length plus 20; results are logged
          under ``Generated/*``.

        Only batch item 0 is visualised, and the forward-pass mels and the
        target are cut to the first 600 entries along their second axis.
        Every entry is written at ``model.step``.

        Args:
            model: Model to evaluate. It is switched to eval mode and is
                not switched back by this method.
            session: Session whose ``val_sample`` provides the batch.
        """
        # Function-scope import so this method does not rely on a
        # module-level ``torch`` import being present.
        import torch

        def log_figures(section, named_figs):
            # Write each (name, figure) pair as '<section>/<name>'.
            for name, fig in named_figs:
                self.writer.add_figure(f'{section}/{name}', fig, model.step)

        def log_audio(section, named_wavs):
            # Write each (name, waveform) pair as '<section>/<name>'.
            for name, wav in named_wavs:
                self.writer.add_audio(tag=f'{section}/{name}',
                                      snd_tensor=wav,
                                      global_step=model.step,
                                      sample_rate=hp.sample_rate)

        model.eval()
        device = next(model.parameters()).device
        x, m, _, _, m_lens = session.val_sample
        x, m = x.to(device), m.to(device)

        # Nothing is backpropagated here, so skip building the autograd graph.
        with torch.no_grad():
            m1_hat, m2_hat, att = model(x, m)
        att = np_now(att)[0]
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        att_fig = plot_attention(att)
        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        # The target figure is reused for the 'Generated' section below.
        m_fig = plot_mel(m)
        log_figures('Ground_Truth_Aligned', (
            ('attention', att_fig),
            ('target', m_fig),
            ('linear', m1_hat_fig),
            ('postnet', m2_hat_fig),
        ))

        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)
        log_audio('Ground_Truth_Aligned', (
            ('target_wav', target_wav),
            ('postnet_wav', m2_hat_wav),
        ))

        # Free-running generation, allowed 20 steps more than the target
        # mel length of the first item.
        with torch.no_grad():
            m1_hat, m2_hat, att = model.generate(x[0].tolist(),
                                                 steps=m_lens[0] + 20)
        log_figures('Generated', (
            ('attention', plot_attention(att)),
            ('target', m_fig),
            ('linear', plot_mel(m1_hat)),
            ('postnet', plot_mel(m2_hat)),
        ))

        log_audio('Generated', (
            ('target_wav', target_wav),
            ('postnet_wav', reconstruct_waveform(m2_hat)),
        ))
Ejemplo n.º 3
0
    def generate_plots(self, model: ForwardTacotron,
                       session: TTSSession) -> None:
        """Log mel plots and audio for the session's validation sample.

        The model is run twice on ``session.val_sample``:

        * through its regular forward pass with the target mel and
          durations supplied; results are logged under
          ``Ground_Truth_Aligned/*``.
        * through ``model.generate`` on the first input sequence; results
          are logged under ``Generated/*``.

        Only batch item 0 is visualised, and the forward-pass outputs and
        the target are cut to the first 600 entries along their second
        axis. Mels are passed through ``rescale_mel`` before
        ``reconstruct_waveform``. Every entry is written at ``model.step``.

        Args:
            model: Model to evaluate. It is switched to eval mode and is
                not switched back by this method.
            session: Session whose ``val_sample`` provides the batch.
        """
        # Function-scope import so this method does not rely on a
        # module-level ``torch`` import being present.
        import torch

        def log_figures(section, named_figs):
            # Write each (name, figure) pair as '<section>/<name>'.
            for name, fig in named_figs:
                self.writer.add_figure(f'{section}/{name}', fig, model.step)

        def log_audio(section, named_wavs):
            # Write each (name, waveform) pair as '<section>/<name>'.
            for name, wav in named_wavs:
                self.writer.add_audio(tag=f'{section}/{name}',
                                      snd_tensor=wav,
                                      global_step=model.step,
                                      sample_rate=hp.sample_rate)

        model.eval()
        device = next(model.parameters()).device
        x, m, _, _, dur = session.val_sample
        x, m, dur = x.to(device), m.to(device), dur.to(device)

        # Nothing is backpropagated here, so skip building the autograd graph.
        with torch.no_grad():
            m1_hat, m2_hat, _ = model(x, m, dur)
        m1_hat = np_now(m1_hat)[0, :600, :]
        m2_hat = np_now(m2_hat)[0, :600, :]
        m = np_now(m)[0, :600, :]

        m1_hat_fig = plot_mel(m1_hat)
        m2_hat_fig = plot_mel(m2_hat)
        # The target figure is reused for the 'Generated' section below.
        m_fig = plot_mel(m)
        log_figures('Ground_Truth_Aligned', (
            ('target', m_fig),
            ('linear', m1_hat_fig),
            ('postnet', m2_hat_fig),
        ))

        # Only the postnet output and the target are turned into audio, so
        # the linear output is not rescaled here.
        m2_hat, m = rescale_mel(m2_hat), rescale_mel(m)
        m2_hat_wav = reconstruct_waveform(m2_hat)
        target_wav = reconstruct_waveform(m)
        log_audio('Ground_Truth_Aligned', (
            ('target_wav', target_wav),
            ('postnet_wav', m2_hat_wav),
        ))

        with torch.no_grad():
            m1_hat, m2_hat, _ = model.generate(x[0].tolist())
        m1_hat, m2_hat = rescale_mel(m1_hat), rescale_mel(m2_hat)
        # NOTE(review): the generated mels are plotted after rescale_mel,
        # while the reused target figure was plotted before it - confirm
        # that the differing scales are intended.
        log_figures('Generated', (
            ('target', m_fig),
            ('linear', plot_mel(m1_hat)),
            ('postnet', plot_mel(m2_hat)),
        ))

        log_audio('Generated', (
            ('target_wav', target_wav),
            ('postnet_wav', reconstruct_waveform(m2_hat)),
        ))
Ejemplo n.º 4
0
def create_align_features(model: Tacotron, train_set: DataLoader,
                          val_set: DataLoader, save_path: Path):
    """Derive per-character frame counts from attention and save them.

    Every batch of ``train_set`` followed by ``val_set`` is run through the
    model without gradients. The returned attention is treated as indexed by
    (batch item, mel step, character): for each mel step the character with
    the highest attention is picked, and the number of mel steps assigned to
    each character is counted over the first ``mel_lens[b]`` steps. One
    int32 array per item is written to ``save_path / '<item_id>.npy'``.

    Args:
        model: Tacotron model; its reduction factor ``r`` must be 1.
        train_set: Loader yielding ``(x, mels, ids, mel_lens)`` batches.
        val_set: Loader yielding batches of the same layout.
        save_path: Directory that receives the ``.npy`` files.

    Raises:
        AssertionError: If ``model.r`` is not 1.
    """
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    # Inputs are moved to whatever device the model parameters live on.
    device = next(model.parameters()).device
    total_batches = len(val_set) + len(train_set)
    batches = itertools.chain(train_set, val_set)
    for batch_no, (x, mels, ids, mel_lens) in enumerate(batches, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, _, attn = model(x, mels)
        attn = np_now(attn)
        batch_size, n_chars = attn.shape[0], attn.shape[2]
        # Index of the most attended character for every mel step.
        peaks = np.argmax(attn, axis=2)
        mel_counts = np.zeros(shape=(batch_size, n_chars), dtype=np.int32)
        for b in range(batch_size):
            peak_row = peaks[b]  # view: edits below write through to peaks
            # Suppress sudden jumps of more than 10 characters by holding
            # the previous (already corrected) position.
            for step in range(1, peak_row.shape[0]):
                previous = peak_row[step - 1]
                if abs(peak_row[step] - previous) > 10:
                    peak_row[step] = previous
            histogram = np.bincount(peak_row[:mel_lens[b]])
            mel_counts[b, :len(histogram)] = histogram

        for row, item_id in enumerate(ids):
            target_file = save_path / f'{item_id}.npy'
            np.save(str(target_file), mel_counts[row], allow_pickle=False)
        stream(f'{progbar(batch_no, total_batches)} '
               f'{batch_no}/{total_batches} Batches ')
Ejemplo n.º 5
0
def create_align_features(
    model: Tacotron,
    train_set: DataLoader,
    val_set: DataLoader,
    save_path_alg: Path,
):
    """Extract durations from attention and save them, one file per item.

    Every batch of ``train_set`` (followed by ``val_set`` when it is not
    ``None``) is run through the model without gradients. Durations are
    computed from the attention by ``extract_durations_with_dijkstra`` when
    ``hp.extract_durations_with_dijkstra`` is truthy, otherwise by
    ``extract_durations_per_count``, and written to
    ``save_path_alg / '<item_id>.npy'``. The alignment and sharpness scores
    returned by ``attention_score`` are collected per item id and pickled to
    ``paths.data / 'att_score_dict.pkl'`` at the end.

    Only index 0 of each batch is processed, so any further items in a
    batch are ignored (presumably the loaders use batch size 1 - verify
    against the caller).

    Args:
        model: Tacotron model; its reduction factor ``r`` must be 1.
        train_set: Loader yielding ``(x, mels, ids, x_lens, mel_lens)``.
        val_set: Loader with the same batch layout, or ``None`` to process
            ``train_set`` only.
        save_path_alg: Directory that receives the duration ``.npy`` files.

    Raises:
        AssertionError: If ``model.r`` is not 1.
    """
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    # Inputs are moved to whatever device the model parameters live on.
    device = next(model.parameters()).device
    if val_set is not None:
        iters = len(val_set) + len(train_set)
        dataset = itertools.chain(train_set, val_set)
    else:
        iters = len(train_set)
        dataset = train_set

    att_score_dict = {}

    if hp.extract_durations_with_dijkstra:
        print('Extracting durations using dijkstra...')
        dur_extraction_func = extract_durations_with_dijkstra
    else:
        print('Extracting durations using attention peak counts...')
        dur_extraction_func = extract_durations_per_count

    for i, (x, mels, ids, _, mel_lens) in enumerate(dataset, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, _, att_batch = model(x, mels)
        align_score, sharp_score = attention_score(att_batch, mel_lens, r=1)
        att_batch = np_now(att_batch)
        # Only the first item of the batch is used from here on.
        seq, att, mel_len, item_id = x[0], att_batch[0], mel_lens[0], ids[0]
        att_score_dict[item_id] = (float(align_score[0]),
                                   float(sharp_score[0]))
        durs = dur_extraction_func(seq, att, mel_len)
        if np.sum(durs) != mel_len:
            print(
                f'WARNING: Sum of durations did not match mel length for item {item_id}!'
            )
        np.save(str(save_path_alg / f'{item_id}.npy'),
                durs,
                allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
    pickle_binary(att_score_dict, paths.data / 'att_score_dict.pkl')