Example #1
 def train(self, model: Tacotron, optimizer: Optimizer) -> None:
     for i, session_params in enumerate(hp.tts_schedule, 1):
         r, lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(
                 path=self.paths.data, batch_size=bs, r=r, model_type='tacotron')
             session = TTSSession(
                 index=i, r=r, lr=lr, max_step=max_step,
                 bs=bs, train_set=train_set, val_set=val_set)
             self.train_session(model, optimizer, session)
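Note: the TTSSession container used above isn't shown in the snippet. A minimal sketch, assuming it only carries the per-session hyperparameters and data loaders (field names inferred from the call site):

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class TTSSession:
        # hypothetical reconstruction from the keyword arguments above
        index: int      # 1-based position in the schedule
        r: int          # decoder reduction factor (outputs per step)
        lr: float       # learning rate for this session
        max_step: int   # train until model.get_step() reaches this
        bs: int         # batch size
        train_set: Any  # loader returned by get_tts_datasets
        val_set: Any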
Example #2
 def train(self, model: Tacotron, optimizer: Optimizer) -> None:
     tts_schedule = self.train_cfg['schedule']
     tts_schedule = parse_schedule(tts_schedule)
     for i, session_params in enumerate(tts_schedule, 1):
         r, lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(
                 path=self.paths.data,
                 batch_size=bs,
                 r=r,
                 model_type='tacotron',
                 max_mel_len=self.train_cfg['max_mel_len'],
                 filter_attention=False)
             session = TTSSession(index=i,
                                  r=r,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set)
             self.train_session(model, optimizer, session=session)
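Note: parse_schedule is defined elsewhere. A plausible sketch, assuming each config entry is a four-element list [r, lr, max_step, bs] that just needs to be normalised into typed tuples:

    from typing import List, Tuple

    def parse_schedule(schedule: List[list]) -> List[Tuple[int, float, int, int]]:
        # hypothetical helper: coerce each [r, lr, max_step, bs] entry to a tuple
        return [(int(r), float(lr), int(max_step), int(bs))
                for r, lr, max_step, bs in schedule]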
Example #3
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout).cuda()

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    model.restore(paths.tts_latest_weights)

    # model.reset_step()
    # model.set_r(hp.tts_r)

    optimiser = optim.Adam(model.parameters())
    current_step = model.get_step()

    if not force_gta:
        for session in hp.tts_schedule:
            r, lr, max_step, batch_size = session

            if current_step < max_step:
                train_set, attn_example = get_tts_dataset(
                    paths.data, batch_size, r)
                model.set_r(r)
                training_steps = max_step - current_step
Example #4
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_model.load('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(line.strip(), hp.tts_cleaner_names)
                for line in f
            ]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    r = tts_model.r

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  (f'Tacotron(r={r})', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', 11_000 if batched else 'N/A'),
                  ('Overlap Samples', 550 if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)

        if input_text:
Example #5
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta',
                        '-g',
                        action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()

            r, lr, max_step, batch_size = session

            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # We have completed training. Breaking is same as continue
                        break
                else:
                    # There is a following session, go to it
                    continue

            model.r = r

            simple_table([(f'Steps with r={r}',
                           str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size), ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size,
                                                       r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
Example #6
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
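Note: np_now and stream are small utilities imported from the repo. Minimal sketches consistent with how they are called here (assumptions, not the repo's exact code):

    import sys
    import numpy as np
    import torch

    def np_now(x: torch.Tensor) -> np.ndarray:
        # detach from the graph and move to CPU before converting
        return x.detach().cpu().numpy()

    def stream(msg: str) -> None:
        # rewrite the current console line for a live progress readout
        sys.stdout.write(f'\r{msg}')
        sys.stdout.flush()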
Example #7
    def train_session(self, model: Tacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        model.r = session.r
        simple_table([(f'Steps with r={session.r}',
                       str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Outputs/Step (r)', model.r)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens,
                    mel_lens) in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                x, m = x.to(device), m.to(device)

                m1_hat, m2_hat, attention = model(x, m)

                m1_loss = F.l1_loss(m1_hat, m)
                m2_loss = F.l1_loss(m2_hat, m)
                loss = m1_loss + m2_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hp.tts_clip_grad_norm)
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.tts_checkpoint_every == 0:
                    ckpt_name = f'taco_step{k}K'
                    save_checkpoint('tts',
                                    self.paths,
                                    model,
                                    optimizer,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.tts_plot_every == 0:
                    self.generate_plots(model, session)

                _, att_score = attention_score(attention, mel_lens)
                att_score = torch.mean(att_score)
                self.writer.add_scalar('Attention_Score/train', att_score,
                                       model.get_step())
                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/reduction_factor', session.r,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss, val_att_score = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            self.writer.add_scalar('Attention_Score/val', val_att_score,
                                   model.get_step())
            save_checkpoint('tts',
                            self.paths,
                            model,
                            optimizer,
                            is_silent=True)

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
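Note: Averager keeps the running means behind the progress message. A minimal sketch matching the add/get/reset calls above:

    class Averager:
        # hypothetical running-mean accumulator matching the usage above
        def __init__(self):
            self.reset()

        def add(self, value: float) -> None:
            self.total += value
            self.count += 1

        def get(self) -> float:
            return self.total / self.count if self.count else 0.0

        def reset(self) -> None:
            self.total = 0.0
            self.count = 0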
Example #8
class TaiwaneseTacotron():
    def __init__(self):
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        #================ vocoder ================#
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            raise ValueError('Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        #================ others ================#
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print("hello")
        print(paths.base)
        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                     fc_dims=hp.voc_fc_dims,
                                     bits=hp.bits,
                                     pad=hp.voc_pad,
                                     upsample_factors=hp.voc_upsample_factors,
                                     feat_dims=hp.num_mels,
                                     compute_dims=hp.voc_compute_dims,
                                     res_out_dims=hp.voc_res_out_dims,
                                     res_blocks=hp.voc_res_blocks,
                                     hop_length=hp.hop_length,
                                     sample_rate=hp.sample_rate,
                                     mode=hp.voc_mode).to(device)

            voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)

            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitializing Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Information === #
        if hp.tts_model == 'tacotron':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron', str(tts_k) + 'k'), ('r', self.tts_model.r),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron', str(tts_k) + 'k'),
                              ('r', self.tts_model.r),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

        elif hp.tts_model == 'tacotron2':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron2', str(tts_k) + 'k'),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron2', str(tts_k) + 'k'),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

    def generate(self, 華, input_text):
        inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
        if hp.tts_model == 'tacotron2':
            self.gen_tacotron2(華, inputs, input_text)

        elif hp.tts_model == 'tacotron':
            self.gen_tacotron(華, inputs, input_text)

        else:
            print(f'Wrong tts model type {hp.tts_model}')

        print('\n\nDone.\n')

    # custom function
    def gen_tacotron2(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            print(x)

            x = np.array(x)[None, :]
            # torch.autograd.Variable is deprecated; tensors track gradients natively
            x = torch.from_numpy(x).cuda().long()

            self.tts_model.eval()
            mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference(
                x)
            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'

            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##

            if self.args.vocoder == 'wavernn':
                m = mel_outputs_postnet
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)

    # custom function
    def gen_tacotron(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, attention = self.tts_model.generate(x)
            # Fix mel spectrogram scaling to be from 0 to 1
            m = (m + 4) / 8
            np.clip(m, 0, 1, out=m)

            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'
            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##
            if self.args.vocoder == 'wavernn':
                m = torch.tensor(m).unsqueeze(0)
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
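Note: reconstruct_waveform is the Griffin-Lim fallback used when no neural vocoder is selected. A sketch of what it plausibly does, assuming an amplitude mel spectrogram and librosa's mel inversion; the hp.n_fft and hp.win_length names are assumptions, and the repo's own implementation may normalise differently:

    import librosa
    import numpy as np

    def reconstruct_waveform(m: np.ndarray, n_iter: int = 32) -> np.ndarray:
        # invert a mel spectrogram with Griffin-Lim phase estimation
        return librosa.feature.inverse.mel_to_audio(
            m,
            sr=hp.sample_rate,
            n_fft=hp.n_fft,
            hop_length=hp.hop_length,
            win_length=hp.win_length,
            n_iter=n_iter)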
Example #9
    def TTS_Wave(self):
        os.makedirs('quick_start/tts_weights/', exist_ok=True)
        os.makedirs('quick_start/voc_weights/', exist_ok=True)

        zip_ref = zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r')
        zip_ref.extractall('quick_start/voc_weights/')
        zip_ref.close()

        zip_ref = zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r')
        zip_ref.extractall('quick_start/tts_weights/')
        zip_ref.close()

        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS Generator')
        parser.add_argument('-name', metavar='name', type=str, help='name of pdf')
        parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
        parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
        parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
        parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
        parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
        parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
        parser.set_defaults(batched=hp.voc_gen_batched)
        parser.set_defaults(target=hp.voc_target)
        parser.set_defaults(overlap=hp.voc_overlap)
        parser.set_defaults(input_text=None)
        parser.set_defaults(weights_path=None)
        args = parser.parse_args()

        batched = args.batched
        target = args.target
        overlap = args.overlap
        input_text = args.input_text
        weights_path = args.weights_path

        if not args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.set_device(0)
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        print('\nInitialising WaveRNN Model...\n')

        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode='MOL').to(device)

        voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

        print('\nInitialising Tacotron Model...\n')

        # Instantiate Tacotron Model
        tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                             num_chars=len(symbols),
                             encoder_dims=hp.tts_encoder_dims,
                             decoder_dims=hp.tts_decoder_dims,
                             n_mels=hp.num_mels,
                             fft_bins=hp.num_mels,
                             postnet_dims=hp.tts_postnet_dims,
                             encoder_K=hp.tts_encoder_K,
                             lstm_dims=hp.tts_lstm_dims,
                             postnet_K=hp.tts_postnet_K,
                             num_highways=hp.tts_num_highways,
                             dropout=hp.tts_dropout).to(device)


        tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

        if input_text:
            inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
        else:
            with open('final.txt') as f:
                inputs = [text_to_sequence(line.strip(), hp.tts_cleaner_names) for line in f]

        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        r = tts_model.get_r()

        simple_table([('WaveRNN', str(voc_k) + 'k'),
                      (f'Tacotron(r={r})', str(tts_k) + 'k'),
                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

        for i, x in enumerate(inputs, 1):

            print("f'\n| Generating {i}/{len(inputs)}'")
            _, m, attention = tts_model.generate(x)

            # both branches of the original if/else produced the same path
            save_path = './output_audio/' + str(i) + '.wav'

            # save_attention(attention, save_path)

            m = torch.tensor(m).unsqueeze(0)
            m = (m + 4) / 8

            voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)


            if i == 2:

                temp1 = AudioSegment.from_wav("./output_audio/"+str(i-1)+".wav")
                temp2 = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = temp1 + temp2

                os.remove("./output_audio/"+str(i-1)+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")

            elif i > 2:

                preTemp = AudioSegment.from_wav("./output_audio/"+self.path[:-4]+".wav")

                newTemp = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = preTemp + newTemp

                os.remove("./output_audio/"+self.path[:-4]+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")


        print("Done")
Example #10
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, att_guides) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            orig_attention = attention
            n = int(len(att_guides[0]) / r)
            # reduce the guides by the reduction factor r
            ga = [a[t] for a in att_guides for t in range(0, len(a), r)]

            assert n == len(attention[0])
            guided_attention = [ga[k:k + n] for k in range(0, len(ga), n)]

            attention = np_now(attention)
            attention = [
                pad2d_nonzero(x, n, len(att_guides[0][0])) for x in attention
            ]

            guided_attention = torch.tensor(guided_attention)
            guided_attention = guided_attention.to(device)

            attention = torch.tensor(attention)
            attention = attention.to(device)

            # create attention mask (padded positions are marked with -1)
            attention_masks = torch.ne(attention, -1).type(torch.FloatTensor)
            attention_masks = attention_masks.to(device)

            multiply = torch.abs(
                attention * guided_attention) * attention_masks

            attention_loss = torch.sum(multiply)

            mask_sum = torch.sum(attention_masks)

            attention_loss /= mask_sum

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            #print("mask sum", mask_sum)
            #print("attention loss", attention_loss)
            #print("m losses", m1_loss, m2_loss)
            prev_loss = m1_loss + m2_loss
            #print("prev loss", prev_loss)
            loss = m1_loss + m2_loss + attention_loss
            #print("loss + att", loss)

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(orig_attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
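Note: the att_guides tensors yielded by the dataset are precomputed guided-attention penalty matrices. The snippet doesn't show their construction; a common recipe (the guided-attention loss of Tachibana et al., offered here as an assumption about what the dataset stores) is:

    import numpy as np

    def build_attention_guide(n_frames: int, n_chars: int, g: float = 0.2) -> np.ndarray:
        # near-zero penalty on the diagonal, rising towards 1 as the
        # alignment drifts away from t/T ≈ n/N
        W = np.zeros((n_frames, n_chars), dtype=np.float32)
        for t in range(n_frames):
            for n in range(n_chars):
                W[t, n] = 1.0 - np.exp(-((n / n_chars - t / n_frames) ** 2) / (2 * g * g))
        return W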
Example #11
def tts_train_loop_af_offline(paths: Paths,
                              model: Tacotron,
                              optimizer,
                              train_set,
                              lr,
                              train_steps,
                              attn_example,
                              hp=None):

    def smooth(d, eps=float(1e-10)):
        # blend in a tiny uniform component so log/KL terms stay finite
        u = 1.0 / float(d.size()[2])
        return eps * u + (1 - eps) * d

    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss_out, running_loss_attn = 0, 0

        # Perform 1 epoch
        for i, (x, m, ids, _, attn_ref) in enumerate(train_set, 1):

            x, m, attn_ref = x.to(device), m.to(device), attn_ref.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(
                    model, x, m, False, attn_ref)
            else:
                m1_hat, m2_hat, attention = model(x,
                                                  m,
                                                  generate_gta=False,
                                                  attn_ref=attn_ref)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            # attn_loss = F.kl_div(torch.log(smooth(attention)), smooth(attn_ref), reduction='mean') # 'batchmean'
            attn_loss = F.l1_loss(smooth(attention), smooth(attn_ref))

            loss_out = m1_loss + m2_loss
            loss_attn = attn_loss * hp.attn_loss_coeff
            loss = loss_out + loss_attn

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss_out += loss_out.item()
            avg_loss_out = running_loss_out / i
            running_loss_attn += loss_attn.item()
            avg_loss_attn = running_loss_attn / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attn_ref[idx][:, :160]),
                               paths.tts_attention / f'{step}_tf')
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}_af')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss_out: {avg_loss_out:#.4}; Output_attn: {avg_loss_attn:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
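Note: the commented-out line above is the KL-divergence variant of the attention-forcing loss. If re-enabled, reduction='batchmean' is the correct setting per PyTorch's F.kl_div documentation (plain 'mean' averages over elements rather than distributions); a sketch:

    import torch
    import torch.nn.functional as F

    # smooth() keeps both distributions strictly positive so the log is finite
    attn_loss = F.kl_div(torch.log(smooth(attention)),
                         smooth(attn_ref),
                         reduction='batchmean')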
Example #12
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    model.restore(paths.tts_latest_weights)

    optimiser = optim.Adam(model.parameters())

    train_set = get_tts_dataset(paths.data, batch_size)

    if not force_gta:

        total_steps = 10_000_000 if force_train else hp.tts_total_steps

        simple_table([
            ('Remaining', str(
                (total_steps - model.get_step()) // 1000) + 'k Steps'),
            ('Batch Size', batch_size), ('Learning Rate', lr)
        ])

        tts_train_loop(model, optimiser, train_set, lr, total_steps)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
Example #13
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example, max_y, max_x):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, padded_att_guides) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            # Build guided-attention targets for this batch
            reduced_guides = []
            att_guide_path = hp.attention_path
            for j, item_id in enumerate(ids):
                att = np.load(f'{att_guide_path}/{item_id}.npy')
                reduced = att[0::r]  # subsample the guide by the reduction factor

                pred_attention = attention[j]
                n_frames = pred_attention.shape[0]
                n_phones = pred_attention.shape[-1]

                # pad with -1 so the padded region can be masked out below
                padded = pad2d_nonzero(reduced, n_frames, n_phones)
                reduced_guides.append(padded)

            reduced_guides = torch.tensor(reduced_guides)
            mask = torch.ne(reduced_guides, -1).type(torch.FloatTensor)

            padded_guides = [
                pad2d_zero(g, n_frames, n_phones) for g in reduced_guides
            ]
            padded_guides = torch.tensor(padded_guides).to(device)
            attention = attention.to(device)
            mask = mask.to(device)
            attention = attention * mask

            # masked squared-error attention loss
            multiply = torch.pow((attention - padded_guides), 2)
            attention_loss = torch.sum(multiply) / torch.sum(mask)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss + attention_loss
            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
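Note: pad2d_nonzero and pad2d_zero are repo helpers. Minimal sketches consistent with the call sites, padding a [frames, phones] matrix up to a fixed size with -1 (so torch.ne(x, -1) recovers the valid region) or with 0:

    import numpy as np

    def pad2d_zero(a: np.ndarray, n_frames: int, n_phones: int) -> np.ndarray:
        # hypothetical: zero-pad up to (n_frames, n_phones), cropping any excess
        out = np.zeros((n_frames, n_phones), dtype=np.float32)
        out[:a.shape[0], :a.shape[1]] = a[:n_frames, :n_phones]
        return out

    def pad2d_nonzero(a: np.ndarray, n_frames: int, n_phones: int) -> np.ndarray:
        # hypothetical: pad with -1 so padding can be masked out later
        out = np.full((n_frames, n_phones), -1.0, dtype=np.float32)
        out[:a.shape[0], :a.shape[1]] = a[:n_frames, :n_phones]
        return out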