Example #1
    def save(self, iteration):
        """Save model checkpoint.

        Args:
            iteration (int): iteration number of the model to be saved.

        Returns:
            None
        """
        io.save_parameters(self.checkpoint_dir, iteration, self.waveflow,
                           self.optimizer)
Example #2
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            is_vocoder=True).reader()

    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
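
The learning-rate argument built for dg.NoamDecay above reads oddly at first: NoamDecay's first argument plays the role of d_model in the standard Noam schedule, and choosing d_model = 1 / (warm_up_step * learning_rate**2) makes the rate peak at exactly the configured learning_rate once warm-up ends. A minimal, framework-free sketch of that schedule, assuming NoamDecay follows the formula from "Attention Is All You Need":

def noam_lr(step, warm_up_step, learning_rate):
    # Standard Noam schedule: d_model**-0.5 * min(step**-0.5,
    # step * warm_up_step**-1.5), with d_model chosen as in the
    # optimizer setup above.
    d_model = 1.0 / (warm_up_step * learning_rate ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warm_up_step ** -1.5)

# The peak, reached at step == warm_up_step, recovers the configured rate:
assert abs(noam_lr(4000, 4000, 1e-3) - 1e-3) < 1e-9
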
Example #3
def train(args, config):
    model = create_model(config)
    train_loader, valid_loader = create_data(config, args.input)
    optim = create_optimizer(model, config)

    global global_step
    max_iteration = config["max_iteration"]
    
    iterator = iter(tqdm.tqdm(train_loader))
    while global_step <= max_iteration:
        # get inputs
        try:
            batch = next(iterator)
        except StopIteration:
            iterator = iter(tqdm.tqdm(train_loader))
            batch = next(iterator)
        
        # unzip it
        text_seqs, text_lengths, specs, mels, num_frames = batch

        # forward & backward
        model.train()
        outputs = model(text_seqs, text_lengths, speakers=None, mel=mels)
        decoded, refined, attentions, final_state = outputs

        causal_mel_loss = model.spec_loss(decoded, mels, num_frames)
        non_causal_mel_loss = model.spec_loss(refined, mels, num_frames)
        loss = causal_mel_loss + non_causal_mel_loss
        loss.backward()

        # update
        optim.minimize(loss)
        optim.clear_gradients()  # avoid accumulating gradients across steps

        # logging
        tqdm.tqdm.write("[train] step: {}\tloss: {:.6f}\tcausal:{:.6f}\tnon_causal:{:.6f}".format(
            global_step, 
            loss.numpy()[0], 
            causal_mel_loss.numpy()[0], 
            non_causal_mel_loss.numpy()[0]))
        writer.add_scalar("loss/causal_mel_loss", causal_mel_loss.numpy()[0], step=global_step)
        writer.add_scalar("loss/non_causal_mel_loss", non_causal_mel_loss.numpy()[0], step=global_step)
        writer.add_scalar("loss/loss", loss.numpy()[0], step=global_step)
        
        if global_step % config["report_interval"] == 0:
            text_length = int(text_lengths.numpy()[0])
            num_frame = int(num_frames.numpy()[0])

            tag = "train_mel/ground-truth"
            img = cm.viridis(normalize(mels.numpy()[0, :num_frame].T))
            writer.add_image(tag, img, step=global_step)

            tag = "train_mel/decoded"
            img = cm.viridis(normalize(decoded.numpy()[0, :num_frame].T))
            writer.add_image(tag, img, step=global_step)

            tag = "train_mel/refined"
            img = cm.viridis(normalize(refined.numpy()[0, :num_frame].T))
            writer.add_image(tag, img, step=global_step)

            vocoder = WaveflowVocoder()
            vocoder.model.eval()

            tag = "train_audio/ground-truth-waveflow"
            wav = vocoder(F.transpose(mels[0:1, :num_frame, :], (0, 2, 1)))
            writer.add_audio(tag, wav.numpy()[0], step=global_step, sample_rate=22050)

            tag = "train_audio/decoded-waveflow"
            wav = vocoder(F.transpose(decoded[0:1, :num_frame, :], (0, 2, 1)))
            writer.add_audio(tag, wav.numpy()[0], step=global_step, sample_rate=22050)

            tag = "train_audio/refined-waveflow"
            wav = vocoder(F.transpose(refined[0:1, :num_frame, :], (0, 2, 1)))
            writer.add_audio(tag, wav.numpy()[0], step=global_step, sample_rate=22050)
            
            attentions_np = attentions.numpy()
            attentions_np = attentions_np[:, 0, :num_frame // 4, :text_length]
            for i, attention_layer in enumerate(np.rot90(attentions_np, axes=(1, 2))):
                tag = "train_attention/layer_{}".format(i)
                img = cm.viridis(normalize(attention_layer))
                writer.add_image(tag, img, step=global_step, dataformats="HWC")

        if global_step % config["save_interval"] == 0:
            save_parameters(writer.logdir, global_step, model, optim)

        # global step +1
        global_step += 1
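
The normalize helper used before cm.viridis in the logging code above is not defined in this snippet. A plausible minimal stand-in is a min-max rescale to [0, 1], the range matplotlib colormaps expect; this is an assumption about its behavior, not the project's actual implementation:

import numpy as np
from matplotlib import cm

def normalize(x, eps=1e-8):
    # Hypothetical stand-in for the undefined `normalize` above:
    # min-max rescale to [0, 1] so cm.viridis can map values to RGBA.
    x = np.asarray(x, dtype=np.float32)
    return (x - x.min()) / (x.max() - x.min() + eps)

# e.g. img = cm.viridis(normalize(mel))  # H x W x 4 float image for the writer
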
Example #4
    while global_step <= max_iterations:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm.tqdm(train_loader))
            batch = next(iterator)

        audio_clips, mel_specs, audio_starts = batch

        model.train()
        y_var = model(audio_clips, mel_specs, audio_starts)
        loss_var = model.loss(y_var, audio_clips)
        loss_var.backward()
        loss_np = loss_var.numpy()

        writer.add_scalar("loss", loss_np[0], global_step)
        writer.add_scalar("learning_rate",
                          optim._learning_rate.step().numpy()[0], global_step)
        optim.minimize(loss_var)
        optim.clear_gradients()
        print("global_step: {}\tloss: {:<8.6f}".format(global_step,
                                                       loss_np[0]))

        if global_step % snap_interval == 0:
            valid_model(model, valid_loader, writer, global_step, sample_rate)

        if global_step % checkpoint_interval == 0:
            io.save_parameters(checkpoint_dir, global_step, model, optim)

        global_step += 1
Example #5
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(dg.parallel.Env()
                            .dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        # Forward
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss',
                              mel_postnet_loss.numpy(), global_step)
            writer.add_scalar('duration_loss',
                              duration_loss.numpy(), global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()
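
Examples #2, #5, #8, and #9 all share the same backward/update sequence under fluid's dygraph DataParallel. Factored out for readability, the pattern looks like the sketch below, which uses only calls that already appear in the code above:

def backward_and_update(model, optimizer, loss, parallel):
    """Backward/update step shared by the training loops above.

    Under DataParallel the loss is rescaled and gradients are
    all-reduced across ranks before the optimizer step.
    """
    if parallel:
        loss = model.scale_loss(loss)
        loss.backward()
        model.apply_collective_grads()
    else:
        loss.backward()
    optimizer.minimize(loss)
    model.clear_gradients()
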
Example #6
        # record learning rate before updating
        if env.local_rank == 0:
            writer.add_scalar("learning_rate",
                              optim._learning_rate.step().numpy(), global_step)
        optim.minimize(l)
        optim.clear_gradients()

        # record step losses
        step_loss = {k: v.numpy()[0] for k, v in losses.items()}

        if env.local_rank == 0:
            tqdm.tqdm.write("[Train] global_step: {}\tloss: {}".format(
                global_step, step_loss["loss"]))
            for k, v in step_loss.items():
                writer.add_scalar(k, v, global_step)

        # train state saving, the first sentence in the batch
        if env.local_rank == 0 and global_step % snap_interval == 0:
            input_specs = (mel_specs, lin_specs)
            state_saver(outputs, input_specs, global_step)

        # evaluation
        if env.local_rank == 0 and global_step % eval_interval == 0:
            evaluator(model, global_step)

        # save checkpoint
        if env.local_rank == 0 and global_step % save_interval == 0:
            save_parameters(ckpt_dir, global_step, model, optim)

        global_step += 1
Example #7
                ]
                for idx, sent in enumerate(sentences):
                    wav, attn = eval_model(
                        dv3, sent, replace_pronounciation_prob, min_level_db,
                        ref_level_db, power, n_iter, win_length, hop_length,
                        preemphasis)
                    wav_path = os.path.join(
                        state_dir, "waveform",
                        "eval_sample_{:09d}.wav".format(global_step))
                    sf.write(wav_path, wav, sample_rate)
                    writer.add_audio(
                        "eval_sample_{}".format(idx),
                        wav,
                        global_step,
                        sample_rate=sample_rate)
                    attn_path = os.path.join(
                        state_dir, "alignments",
                        "eval_sample_attn_{:09d}.png".format(global_step))
                    plot_alignment(attn, attn_path)
                    writer.add_image(
                        "eval_sample_attn{}".format(idx),
                        cm.viridis(attn),
                        global_step,
                        dataformats="HWC")

            # save checkpoint
            if global_step % save_interval == 0:
                io.save_parameters(ckpt_dir, global_step, dv3, optim)

            global_step += 1
Example #8
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(),
                              global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(),
                              global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                  model._layers.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model._layers.decoder.alpha.numpy(),
                                  global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                  model.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model.decoder.alpha.numpy(),
                                  global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step,
                            x,
                            i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
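
The attention-logging branches above repeat the same conversion from an attention matrix to an image. Read as a helper (a sketch, not part of the original code), the conversion is:

import numpy as np
from matplotlib import cm

def attention_to_image(prob):
    # Attention weights lie in [0, 1]; cm.viridis maps them to an
    # H x W x 4 RGBA array of floats in [0, 1], scaled here to uint8
    # for writer.add_image.
    return np.uint8(cm.viridis(prob) * 255)
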
Example #9
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: training did not converge when the stop-token loss was used.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars(
                    'training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss', stop_loss.numpy(),
                                      global_step)

                if parallel:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                else:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_%d_0' % global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_enc_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_dec_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
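
For the stop-token branch above, the label is derived from pos_mel by marking positions equal to 0. Assuming pos_mel holds 1-based frame positions with 0 used for padding (an assumption about the data layout, not stated in the snippet), a small NumPy illustration:

import numpy as np

pos_mel = np.array([[1, 2, 3, 4, 0, 0]])   # assumed padded position ids
label = (pos_mel == 0).astype(np.float32)  # -> [[0., 0., 0., 0., 1., 1.]]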