Example #1
    def setup_model(self):
        config = self.config
        frontend = English()
        model = TransformerTTS(
            frontend,
            d_encoder=config.model.d_encoder,
            d_decoder=config.model.d_decoder,
            d_mel=config.data.d_mel,
            n_heads=config.model.n_heads,
            d_ffn=config.model.d_ffn,
            encoder_layers=config.model.encoder_layers,
            decoder_layers=config.model.decoder_layers,
            d_prenet=config.model.d_prenet,
            d_postnet=config.model.d_postnet,
            postnet_layers=config.model.postnet_layers,
            postnet_kernel_size=config.model.postnet_kernel_size,
            max_reduction_factor=config.model.max_reduction_factor,
            decoder_prenet_dropout=config.model.decoder_prenet_dropout,
            dropout=config.model.dropout)
        if self.parallel:
            model = paddle.DataParallel(model)
        optimizer = paddle.optimizer.Adam(learning_rate=config.training.lr,
                                          beta1=0.9,
                                          beta2=0.98,
                                          epsilon=1e-9,
                                          parameters=model.parameters())
        criterion = TransformerTTSLoss(config.model.stop_loss_scale)
        drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
        reduction_factor = scheduler.StepWise(config.training.reduction_factor)

        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.drop_n_heads = drop_n_heads
        self.reduction_factor = reduction_factor
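
As a side note to Example #1: the drop_n_heads and reduction_factor entries above are step-wise schedules. The following is a minimal sketch of such a piecewise-constant lookup, under the assumption that the config holds (step, value) anchor pairs; it is not Parakeet's actual scheduler.StepWise implementation.

# Minimal sketch of a step-wise schedule (an assumption, not the library's scheduler.StepWise).
class StepWiseSketch:
    def __init__(self, anchors):
        # anchors: (boundary_step, value) pairs sorted by step, e.g. [(0, 10), (80000, 4)]
        self.anchors = list(anchors)

    def __call__(self, step):
        value = self.anchors[0][1]
        for boundary, v in self.anchors:
            if step >= boundary:
                value = v
        return value

# e.g. StepWiseSketch([(0, 10), (80000, 4)])(100000) returns 4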
Example #2
def main(config, args):
    paddle.set_device(args.device)

    # model
    frontend = English()
    model = TransformerTTS.from_pretrained(frontend, config,
                                           args.checkpoint_path)
    model.eval()

    # inputs
    input_path = Path(args.input).expanduser()
    with open(input_path, "rt") as f:
        sentences = f.readlines()

    output_dir = Path(args.output).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    for i, sentence in enumerate(sentences):
        outputs = model.predict(sentence, verbose=args.verbose)
        mel_output = outputs["mel_output"]
        # cross_attention_weights = outputs["cross_attention_weights"]
        mel_output = mel_output.T  #(C, T)
        np.save(str(output_dir / f"sentence_{i}"), mel_output)
        if args.verbose:
            print("spectrogram saved at {}".format(output_dir /
                                                   f"sentence_{i}.npy"))
Example #3
    def _initialize(self):
        """
        initialize with the necessary elements
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-120000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)

        # The max length of audio when synthesizing.
        self.max_len = 1000
        # The threshold of the stop token that decides whether spectrum generation should stop at the current time step.
        self.stop_threshold = 0.5

        with fluid.dygraph.guard(fluid.CPUPlace()):
            # Build TTS.
            with fluid.unique_name.guard():
                network_cfg = self.tts_config['network']
                self.tts_model = TransformerTTSModel(
                    network_cfg['embedding_size'], network_cfg['hidden_size'],
                    network_cfg['encoder_num_head'],
                    network_cfg['encoder_n_layers'],
                    self.tts_config['audio']['num_mels'],
                    network_cfg['outputs_per_step'],
                    network_cfg['decoder_num_head'],
                    network_cfg['decoder_n_layers'])
                io.load_parameters(model=self.tts_model,
                                   checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)
Example #4
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(csv_path,
                            sep="|",
                            header=None,
                            quoting=csv.QUOTE_NONE,
                            names=["fname", "raw_text", "normalized_text"])
        ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
            wav = ljspeech_processor.load_wav(
                os.path.join(args.data, 'wavs', fname + ".wav"))
            mel_input = ljspeech_processor.melspectrogram(wav).astype(
                np.float32)
            mel_input = np.transpose(mel_input, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            dec_slf_mask = get_triu_tensor(mel_input,
                                           mel_input).astype(np.float32)
            dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0)
            dec_slf_mask = fluid.layers.cast(dg.to_variable(dec_slf_mask != 0),
                                             np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.txt', "wb") as f:
            pickle.dump(alignments, f)
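
Assuming get_triu_tensor returns the strictly upper-triangular part, the decoder self-attention mask built in Example #4 amounts to the small numpy sketch below: future mel frames are filled with a large negative value so their softmax attention weight becomes effectively zero.

import numpy as np

def causal_attention_mask(length, neg_fill=-2 ** 32 + 1):
    # positions strictly above the diagonal are future frames that must not be attended to
    future = np.triu(np.ones((length, length), dtype=np.float32), k=1)
    return future * neg_fill

# e.g. row 0 of causal_attention_mask(4) is [0, neg, neg, neg]: the first frame only attends to itself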
Example #5
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    with fluid.unique_name.guard():
        model_vocoder = Vocoder(cfg['train']['batch_size'],
                                cfg['vocoder']['hidden_size'],
                                cfg['audio']['num_mels'],
                                cfg['audio']['n_fft'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
        model_vocoder.eval()
    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

    pbar = tqdm(range(args.max_len))
    for i in pbar:
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)

    mag_pred = model_vocoder(postnet_pred)

    _ljspeech_processor = audio.AudioProcessor(
        sample_rate=cfg['audio']['sr'],
        num_mels=cfg['audio']['num_mels'],
        min_level_db=cfg['audio']['min_level_db'],
        ref_level_db=cfg['audio']['ref_level_db'],
        n_fft=cfg['audio']['n_fft'],
        win_length=cfg['audio']['win_length'],
        hop_length=cfg['audio']['hop_length'],
        power=cfg['audio']['power'],
        preemphasis=cfg['audio']['preemphasis'],
        signal_norm=True,
        symmetric_norm=False,
        max_norm=1.,
        mel_fmin=0,
        mel_fmax=None,
        clip_norm=True,
        griffin_lim_iters=60,
        do_trim_silence=False,
        sound_norm=False)

    # synthesis with cbhg
    wav = _ljspeech_processor.inv_spectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                               [1, 0]).numpy())
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step,
                             x,
                             i * 4 + j,
                             dataformats="HWC")

    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])

    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
          cfg['audio']['sr'], wav)

    # synthesis with griffin-lim
    wav = _ljspeech_processor.inv_melspectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(postnet_pred, [0]),
                               [1, 0]).numpy())
    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])

    write(os.path.join(os.path.join(args.output, 'samples'), 'griffin.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #6
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(
        dg.to_variable(pos_text).astype(np.int64), [0])

    for i in range(args.max_len):
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(
            dg.to_variable(pos_mel).astype(np.int64), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        if stop_preds.numpy()[0, -1] > args.stop_threshold:
            break
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step,
                             x,
                             i * 4 + j,
                             dataformats="HWC")

    if args.vocoder == 'griffin-lim':
        # synthesis with griffin-lim
        wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
    elif args.vocoder == 'waveflow':
        # synthesis with waveflow
        wav = synthesis_with_waveflow(postnet_pred, args,
                                      args.checkpoint_vocoder, place)
    else:
        print(
            "vocoder error, we only support 'griffin-lim' and 'waveflow', but received %s."
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'),
                     args.vocoder + '.wav'), cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #7
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            # load
            wav, _ = librosa.load(
                str(os.path.join(args.data, 'wavs', fname + ".wav")))

            spec = librosa.stft(
                y=wav,
                n_fft=cfg['audio']['n_fft'],
                win_length=cfg['audio']['win_length'],
                hop_length=cfg['audio']['hop_length'])
            mag = np.abs(spec)
            mel = librosa.filters.mel(sr=cfg['audio']['sr'],
                                      n_fft=cfg['audio']['n_fft'],
                                      n_mels=cfg['audio']['num_mels'],
                                      fmin=cfg['audio']['fmin'],
                                      fmax=cfg['audio']['fmax'])
            mel = np.matmul(mel, mag)
            mel = np.log(np.maximum(mel, 1e-5))

            mel_input = np.transpose(mel, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.pkl', "wb") as f:
            pickle.dump(alignments, f)
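
A minimal sketch of reading the alignment file written by Example #7 (the file name is illustrative; the example writes to args.output + '.pkl'):

import pickle

with open("alignments.pkl", "rb") as f:
    alignments = pickle.load(f)  # OrderedDict mapping fname -> alignment extracted from the attention maps
print(len(alignments), "alignments loaded")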
Example #8
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(),
                              global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(),
                              global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                   model._layers.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model._layers.decoder.alpha.numpy(),
                                   global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                   model.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model.decoder.alpha.numpy(),
                                   global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step,
                            x,
                            i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
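
A side note on the AdamOptimizer configuration used above: assuming Paddle's NoamDecay implements the standard Transformer schedule lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5), choosing d_model = 1 / (warm_up_step * learning_rate**2) makes the peak rate at step == warm_up_step exactly cfg['train']['learning_rate']. A small sketch:

def noam_lr(step, warm_up_step, learning_rate):
    # standard Transformer warm-up schedule; d_model is chosen so the peak equals learning_rate
    d_model = 1.0 / (warm_up_step * learning_rate ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warm_up_step ** -1.5)

# e.g. noam_lr(4000, warm_up_step=4000, learning_rate=0.001) == 0.001, decaying roughly as step**-0.5 afterwards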
Example #9
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: when the stop token loss was used, training did not work.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars(
                    'training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss', stop_loss.numpy(),
                                      global_step)

                if parallel:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                else:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_%d_0' % global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_enc_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_dec_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
Example #10
class TransformerTTS(hub.NLPPredictionModule):
    def _initialize(self):
        """
        initialize with the necessary elements
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-120000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)

        # The max length of audio when synthesizing.
        self.max_len = 1000
        # The threshold of the stop token that decides whether spectrum generation should stop at the current time step.
        self.stop_threshold = 0.5

        with fluid.dygraph.guard(fluid.CPUPlace()):
            # Build TTS.
            with fluid.unique_name.guard():
                network_cfg = self.tts_config['network']
                self.tts_model = TransformerTTSModel(
                    network_cfg['embedding_size'], network_cfg['hidden_size'],
                    network_cfg['encoder_num_head'],
                    network_cfg['encoder_n_layers'],
                    self.tts_config['audio']['num_mels'],
                    network_cfg['outputs_per_step'],
                    network_cfg['decoder_num_head'],
                    network_cfg['decoder_n_layers'])
                io.load_parameters(model=self.tts_model,
                                   checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)

    def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
        """
        Get the synthetic wavs from the texts.

        Args:
             texts(list): the input texts to be predicted.
             use_gpu(bool): whether use gpu to predict or not
             vocoder(str): the vocoder name, "griffin-lim" or "waveflow"

        Returns:
             wavs(list): the synthesized audio waveforms. You can use soundfile.write to save them.
             sample_rate(int): the audio sample rate.
        """
        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES is not set, which is required when use_gpu=True."
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        if texts and isinstance(texts, list):
            predicted_data = texts
        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        wavs = []
        with fluid.dygraph.guard(place):
            self.tts_model.eval()
            self.waveflow.eval()
            for text in predicted_data:
                # init input
                logger.info("Processing sentence: %s" % text)
                text = np.asarray(text_to_sequence(text))
                text = fluid.layers.unsqueeze(
                    dg.to_variable(text).astype(np.int64), [0])
                mel_input = dg.to_variable(np.zeros([1, 1,
                                                     80])).astype(np.float32)
                pos_text = np.arange(1, text.shape[1] + 1)
                pos_text = fluid.layers.unsqueeze(
                    dg.to_variable(pos_text).astype(np.int64), [0])

                for i in range(self.max_len):
                    pos_mel = np.arange(1, mel_input.shape[1] + 1)
                    pos_mel = fluid.layers.unsqueeze(
                        dg.to_variable(pos_mel).astype(np.int64), [0])
                    mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = self.tts_model(
                        text, mel_input, pos_text, pos_mel)
                    if stop_preds.numpy()[0, -1] > self.stop_threshold:
                        break
                    mel_input = fluid.layers.concat(
                        [mel_input, postnet_pred[:, -1:, :]], axis=1)
                if vocoder == 'griffin-lim':
                    # synthesis with griffin-lim
                    wav = self.synthesis_with_griffinlim(
                        postnet_pred, self.tts_config['audio'])
                elif vocoder == 'waveflow':
                    # synthesis with waveflow
                    wav = self.synthesis_with_waveflow(
                        postnet_pred, self.waveflow_config.sigma)
                else:
                    raise ValueError(
                        "vocoder error, we only support 'griffin-lim' and 'waveflow', but received %s."
                        % vocoder)
                wavs.append(wav)
        return wavs, self.tts_config['audio']['sr']

    def synthesis_with_griffinlim(self, mel_output, cfg):
        # synthesis with griffin-lim
        mel_output = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_output = np.exp(mel_output.numpy())
        basis = librosa.filters.mel(cfg['sr'],
                                    cfg['n_fft'],
                                    cfg['num_mels'],
                                    fmin=cfg['fmin'],
                                    fmax=cfg['fmax'])
        inv_basis = np.linalg.pinv(basis)
        spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))

        wav = librosa.core.griffinlim(spec**cfg['power'],
                                      hop_length=cfg['hop_length'],
                                      win_length=cfg['win_length'])

        return wav

    def synthesis_with_waveflow(self, mel_output, sigma):
        mel_spectrogram = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])

        for layer in self.waveflow.sublayers():
            if isinstance(layer, WeightNormWrapper):
                layer.remove_weight_norm()

        # Run model inference.
        wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
        return wav.numpy()[0]

    @serving
    def serving_method(self, texts, use_gpu=False, vocoder="griffin-lim"):
        """
        Run as a service.
        """
        wavs, sample_rate = self.synthesize(texts, use_gpu, vocoder)
        wavs = [wav.tolist() for wav in wavs]
        result = {"wavs": wavs, "sample_rate": sample_rate}
        return result

    def add_module_config_arg(self):
        """
        Add the command config options
        """
        self.arg_config_group.add_argument(
            '--use_gpu',
            type=ast.literal_eval,
            default=False,
            help="whether use GPU for prediction")

        self.arg_config_group.add_argument('--vocoder',
                                           type=str,
                                           default="griffin-lim",
                                           choices=['griffin-lim', 'waveflow'],
                                           help="the vocoder name")

    def add_module_output_arg(self):
        """
        Add the command output options
        """
        self.arg_config_group.add_argument(
            '--output_path',
            type=str,
            default=os.path.abspath(
                os.path.join(os.path.curdir, f"{self.name}_prediction")),
            help="path to save experiment results")

    @runnable
    def run_cmd(self, argvs):
        """
        Run as a command
        """
        self.parser = argparse.ArgumentParser(
            description='Run the %s module.' % self.name,
            prog='hub run %s' % self.name,
            usage='%(prog)s',
            add_help=True)

        self.arg_input_group = self.parser.add_argument_group(
            title="Input options", description="Input data. Required")
        self.arg_output_group = self.parser.add_argument_group(
            title="Output options", description="Output path. Optional.")
        self.arg_config_group = self.parser.add_argument_group(
            title="Config options",
            description=
            "Run configuration for controlling module behavior, optional.")

        self.add_module_config_arg()
        self.add_module_input_arg()
        self.add_module_output_arg()

        args = self.parser.parse_args(argvs)

        try:
            input_data = self.check_input_data(args)
        except (DataFormatError, RuntimeError):
            self.parser.print_help()
            return None

        mkdir(args.output_path)
        wavs, sample_rate = self.synthesize(texts=input_data,
                                            use_gpu=args.use_gpu,
                                            vocoder=args.vocoder)

        for index, wav in enumerate(wavs):
            sf.write(os.path.join(args.output_path, f"{index}.wav"), wav,
                     sample_rate)

        ret = f"The synthesized wav files have been saved in {args.output_path}"
        return ret
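
Finally, a hedged usage sketch of the PaddleHub module defined in Example #10; the module name passed to hub.Module is an assumption.

import paddlehub as hub
import soundfile as sf

module = hub.Module(name="transformer_tts_ljspeech")  # assumed module name
wavs, sample_rate = module.synthesize(
    texts=["Life was like a box of chocolates, you never know what you're gonna get."],
    use_gpu=False,
    vocoder="griffin-lim")
for index, wav in enumerate(wavs):
    sf.write(f"{index}.wav", wav, sample_rate)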