Beispiel #1
0
def synthesis(text_input, args):
    """Synthesize speech for ``text_input`` with FastSpeech and two vocoders.

    The FastSpeech checkpoint is loaded, a mel spectrogram is predicted for
    the text, and audio is rendered twice: once through
    ``synthesis_with_clarinet`` and once with Griffin-Lim. Both waveforms are
    logged to the summary writer and written as wav files under
    ``<args.output>/samples``.

    Args:
        text_input: Text to synthesize; also used as prefix of the audio tags.
        args: Parsed options. This function reads ``use_gpu``, ``config``,
            ``output``, ``checkpoint``, ``alpha``, ``config_clarinet`` and
            ``checkpoint_clarinet``.
    """
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)
    audio_cfg = cfg['audio']

    # makedirs(exist_ok=True) also creates args.output itself and tolerates
    # directories that already exist, unlike an exists()/mkdir() pair.
    samples_dir = os.path.join(args.output, 'samples')
    os.makedirs(samples_dir, exist_ok=True)

    # tensorboard
    writer = SummaryWriter(os.path.join(args.output, 'log'))
    try:
        model = FastSpeech(cfg['network'], num_mels=audio_cfg['num_mels'])
        # Load parameters (the returned step counter is not needed here).
        io.load_parameters(model=model, checkpoint_path=args.checkpoint)
        model.eval()

        # Add a leading axis of size 1 to the sequence and build 1-based
        # positions for it.
        text = np.expand_dims(np.asarray(text_to_sequence(text_input)), axis=0)
        pos_text = np.expand_dims(np.arange(1, text.shape[1] + 1), axis=0)

        text = dg.to_variable(text)
        pos_text = dg.to_variable(pos_text)

        _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

        # Clarinet receives the exponentiated output with its leading axis
        # kept; Griffin-Lim works on the squeezed and transposed copy.
        result = np.exp(mel_output_postnet.numpy())
        mel_output_postnet = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
        mel_output_postnet = np.exp(mel_output_postnet.numpy())

        # Keyword arguments work on both older librosa releases and the
        # newer ones where these parameters are keyword-only.
        basis = librosa.filters.mel(sr=audio_cfg['sr'],
                                    n_fft=audio_cfg['n_fft'],
                                    n_mels=audio_cfg['num_mels'])
        # Approximate the linear spectrogram with the pseudo-inverse of the
        # mel filter bank; the floor keeps every value strictly positive.
        inv_basis = np.linalg.pinv(basis)
        spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))

        # synthesis use clarinet
        wav_clarinet = synthesis_with_clarinet(args.config_clarinet,
                                               args.checkpoint_clarinet,
                                               result, place)
        writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
                         audio_cfg['sr'])
        write(os.path.join(samples_dir, 'clarinet.wav'), audio_cfg['sr'],
              wav_clarinet)

        # synthesis use griffin-lim
        wav = librosa.core.griffinlim(spec**audio_cfg['power'],
                                      hop_length=audio_cfg['hop_length'],
                                      win_length=audio_cfg['win_length'])
        writer.add_audio(text_input + '(griffin-lim)', wav, 0,
                         audio_cfg['sr'])
        # The file name spelling is kept as-is so existing consumers of the
        # output directory keep finding the file.
        write(os.path.join(samples_dir, 'grinffin-lim.wav'), audio_cfg['sr'],
              wav)
        print("Synthesis completed !!!")
    finally:
        # Close the event file even when synthesis fails part-way.
        writer.close()
Beispiel #2
0
def synthesis(text_input, args):
    """Synthesize speech for ``text_input`` with FastSpeech and one vocoder.

    The mel spectrogram predicted by FastSpeech is turned into audio with the
    vocoder selected by ``args.vocoder``. The waveform is logged to the
    summary writer and written to ``<args.output>/samples/<vocoder>.wav``.

    Args:
        text_input: Text to synthesize; also used as prefix of the audio tag.
        args: Parsed options. This function reads ``use_gpu``, ``config``,
            ``output``, ``checkpoint``, ``alpha``, ``vocoder`` and, for
            waveflow, ``checkpoint_vocoder`` (``args`` itself is also passed
            on to ``synthesis_with_waveflow``).

    Raises:
        ValueError: If ``args.vocoder`` is neither ``'griffin-lim'`` nor
            ``'waveflow'``.
    """
    # Fail fast on an unsupported vocoder. Previously this case only printed
    # a message and then crashed later because ``wav`` was never assigned.
    if args.vocoder not in ('griffin-lim', 'waveflow'):
        raise ValueError(
            'vocoder error, we only support griffinlim and waveflow, but recevied %s.'
            % args.vocoder)

    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # makedirs(exist_ok=True) also creates args.output itself and tolerates
    # directories that already exist.
    samples_dir = os.path.join(args.output, 'samples')
    os.makedirs(samples_dir, exist_ok=True)

    # tensorboard
    writer = SummaryWriter(os.path.join(args.output, 'log'))
    try:
        model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
        # Load parameters (the returned step counter is not needed here).
        io.load_parameters(model=model, checkpoint_path=args.checkpoint)
        model.eval()

        # Add a leading axis of size 1 to the sequence and build 1-based
        # positions for it.
        text = np.expand_dims(np.asarray(text_to_sequence(text_input)), axis=0)
        pos_text = np.expand_dims(np.arange(1, text.shape[1] + 1), axis=0)

        text = dg.to_variable(text).astype(np.int64)
        pos_text = dg.to_variable(pos_text).astype(np.int64)

        _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

        if args.vocoder == 'griffin-lim':
            # synthesis use griffin-lim
            wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
        else:
            # Only 'waveflow' can reach this branch after the check above.
            wav = synthesis_with_waveflow(mel_output_postnet, args,
                                          args.checkpoint_vocoder, place)

        writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                         cfg['audio']['sr'])
        write(os.path.join(samples_dir, args.vocoder + '.wav'),
              cfg['audio']['sr'], wav)
        print("Synthesis completed !!!")
    finally:
        # Close the event file even when synthesis fails part-way.
        writer.close()
Beispiel #3
0
    def _initialize(self):
        """
        Resolve the bundled asset paths and load the TTS and vocoder models.

        Sets the checkpoint/config path attributes, parses the TTS yaml
        config, then builds both models on CPU and restores their parameters.
        """
        assets_dir = os.path.join(self.directory, "assets")
        tts_dir = os.path.join(assets_dir, "tts")
        vocoder_dir = os.path.join(assets_dir, "vocoder")

        self.tts_checkpoint_path = os.path.join(tts_dir, "step-162000")
        self.waveflow_checkpoint_path = os.path.join(vocoder_dir,
                                                     "step-2000000")
        self.waveflow_config_path = os.path.join(vocoder_dir,
                                                 "waveflow_ljspeech.yaml")

        with open(os.path.join(tts_dir, "ljspeech.yaml")) as config_file:
            self.tts_config = yaml.load(config_file, Loader=yaml.Loader)

        with fluid.dygraph.guard(fluid.CPUPlace()):
            # Acoustic model.
            audio_config = self.tts_config['audio']
            self.tts_model = FastSpeechModel(
                self.tts_config['network'], num_mels=audio_config['num_mels'])
            io.load_parameters(
                model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)

            # Vocoder: its configuration is derived from an args-like object
            # that points at the waveflow yaml file.
            vocoder_args = AttrDict()
            vocoder_args.config = self.waveflow_config_path
            vocoder_args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(vocoder_args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(
                model=self.waveflow,
                checkpoint_path=self.waveflow_checkpoint_path)
Beispiel #4
0
def main(args):
    """Train FastSpeech, optionally data-parallel across several ranks.

    Reads the yaml config at ``args.config``, builds the model, optimizer and
    data reader, restores parameters through ``io.load_parameters`` and then
    trains until the step counter exceeds ``cfg['train']['max_iteration']``.
    Rank 0 logs scalars to a summary writer and saves checkpoints under
    ``<args.output>/checkpoints``.

    Args:
        args: Parsed options. This function reads ``config``, ``use_gpu``,
            ``output``, ``data``, ``alignments_path``, ``iteration`` and
            ``checkpoint``.
    """
    env = dg.parallel.Env()
    local_rank = env.local_rank
    nranks = env.nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    place = fluid.CUDAPlace(env.dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    # Every rank executes this line. An exists()/mkdir() pair can fail with
    # FileExistsError when two ranks race; exist_ok=True makes it safe.
    os.makedirs(args.output, exist_ok=True)

    # Only rank 0 writes summaries.
    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    try:
        model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg['train']['warm_up_step'] *
                     (cfg['train']['learning_rate']**2)),
                cfg['train']['warm_up_step']),
            parameter_list=model.parameters(),
            grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
                'grad_clip_thresh']))
        reader = LJSpeechLoader(
            cfg['audio'],
            place,
            args.data,
            args.alignments_path,
            cfg['train']['batch_size'],
            nranks,
            local_rank,
            shuffle=True).reader
        iterator = iter(tqdm(reader))

        # Load parameters; the returned value is the step to resume from.
        global_step = io.load_parameters(
            model=model,
            optimizer=optimizer,
            checkpoint_dir=os.path.join(args.output, 'checkpoints'),
            iteration=args.iteration,
            checkpoint_path=args.checkpoint)
        print("Rank {}: checkpoint loaded.".format(local_rank))

        if parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        # NOTE(review): with the `<=` bound and the increment inside the
        # loop, a run starting at step 0 performs max_iteration + 1 updates.
        # Kept as in the original; confirm whether that is intended.
        while global_step <= cfg['train']['max_iteration']:
            try:
                batch = next(iterator)
            except StopIteration:
                # Reader exhausted: start another pass over the data.
                iterator = iter(tqdm(reader))
                batch = next(iterator)

            (character, mel, pos_text, pos_mel, alignment) = batch

            global_step += 1

            # Forward
            result = model(
                character, pos_text, mel_pos=pos_mel, length_target=alignment)
            mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
            mel_loss = layers.mse_loss(mel_output, mel)
            mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
            # Mean absolute error between predicted and target durations.
            duration_loss = layers.mean(
                layers.abs(
                    layers.elementwise_sub(duration_predictor_output,
                                           alignment)))
            total_loss = mel_loss + mel_postnet_loss + duration_loss

            if local_rank == 0:
                writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
                writer.add_scalar('post_mel_loss',
                                  mel_postnet_loss.numpy(), global_step)
                writer.add_scalar('duration_loss',
                                  duration_loss.numpy(), global_step)
                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

            if parallel:
                total_loss = model.scale_loss(total_loss)
                total_loss.backward()
                model.apply_collective_grads()
            else:
                total_loss.backward()
            optimizer.minimize(total_loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)
    finally:
        # The writer exists only on rank 0; close it even if training
        # aborts with an exception.
        if writer is not None:
            writer.close()
Beispiel #5
0
class FastSpeech(hub.NLPPredictionModule):
    """Hub module bundling a FastSpeech acoustic model and a WaveFlow vocoder.

    ``synthesize`` is the Python entry point, ``serving_method`` exposes it
    as a service and ``run_cmd`` as a command-line tool.
    """

    def _initialize(self):
        """
        initialize with the necessary elements
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-162000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)
        # Both models are built and restored on CPU.
        with fluid.dygraph.guard(fluid.CPUPlace()):
            self.tts_model = FastSpeechModel(
                self.tts_config['network'],
                num_mels=self.tts_config['audio']['num_mels'])
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)

    def synthesize(self,
                   texts,
                   use_gpu=False,
                   speed=1.0,
                   vocoder="griffin-lim"):
        """
        Get the synthetic wavs from the texts.

        Args:
             texts(list): the input texts to be predicted.
             use_gpu(bool): whether use gpu to predict or not. Default False.
             speed(float): Controlling the voice speed, must be positive.
                 Default 1.0.
             vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

        Returns:
             wavs(list): one waveform array per input text. You can use
                 soundfile.write to save each of them.
             sample_rate(int): the audio sample rate.

        Raises:
             ValueError: if ``texts`` is not a non-empty list, ``speed`` is
                 not positive, or ``vocoder`` is not supported.
        """
        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        if texts and isinstance(texts, list):
            predicted_data = texts
        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        # alpha is computed as 1 / speed below; reject values that would
        # divide by zero or give a negative factor.
        if speed <= 0:
            raise ValueError(
                "speed must be a positive number, but received %s." % speed)

        wavs = []
        with fluid.dygraph.guard(place):
            self.tts_model.eval()
            self.waveflow.eval()
            for text in predicted_data:
                # init input
                logger.info("Processing sentence: %s" % text)
                text = np.asarray(text_to_sequence(text))
                text = np.expand_dims(text, axis=0)
                # 1-based positions for the sequence.
                pos_text = np.arange(1, text.shape[1] + 1)
                pos_text = np.expand_dims(pos_text, axis=0)

                text = dg.to_variable(text).astype(np.int64)
                pos_text = dg.to_variable(pos_text).astype(np.int64)

                _, mel_output_postnet = self.tts_model(text,
                                                       pos_text,
                                                       alpha=1 / speed)

                if vocoder == 'griffin-lim':
                    # synthesis use griffin-lim
                    wav = self.synthesis_with_griffinlim(
                        mel_output_postnet, self.tts_config['audio'])
                elif vocoder == 'waveflow':
                    wav = self.synthesis_with_waveflow(
                        mel_output_postnet, self.waveflow_config.sigma)
                else:
                    raise ValueError(
                        'vocoder error, we only support griffinlim and waveflow, but recevied %s.'
                        % vocoder)
                wavs.append(wav)
        return wavs, self.tts_config['audio']['sr']

    def synthesis_with_griffinlim(self, mel_output, cfg):
        """
        Convert the model's mel output to a waveform with Griffin-Lim.

        Args:
             mel_output: the mel output of the TTS model; its axis 0 is
                 squeezed away and the remaining two axes are swapped.
             cfg(dict): the audio config; reads sr, n_fft, num_mels, fmin,
                 fmax, power, hop_length and win_length.

        Returns:
             wav: the waveform returned by librosa.core.griffinlim.
        """
        mel_output = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_output = np.exp(mel_output.numpy())
        # Keyword arguments work on both older librosa releases and the
        # newer ones where these parameters are keyword-only.
        basis = librosa.filters.mel(sr=cfg['sr'],
                                    n_fft=cfg['n_fft'],
                                    n_mels=cfg['num_mels'],
                                    fmin=cfg['fmin'],
                                    fmax=cfg['fmax'])
        # Approximate the linear spectrogram with the pseudo-inverse of the
        # mel filter bank; the floor keeps every value strictly positive.
        inv_basis = np.linalg.pinv(basis)
        spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))

        wav = librosa.core.griffinlim(spec**cfg['power'],
                                      hop_length=cfg['hop_length'],
                                      win_length=cfg['win_length'])

        return wav

    def synthesis_with_waveflow(self, mel_output, sigma):
        """
        Convert the model's mel output to a waveform with WaveFlow.

        Args:
             mel_output: the mel output of the TTS model.
             sigma: forwarded to ``self.waveflow.synthesize``.

        Returns:
             wav: the first item of the vocoder output as a numpy array.
        """
        # Swap the last two axes while keeping a leading axis of size 1.
        mel_spectrogram = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])

        # NOTE(review): this runs on every call, so after the first call the
        # wrappers have already had weight norm removed - confirm that
        # remove_weight_norm() is safe to call repeatedly.
        for layer in self.waveflow.sublayers():
            if isinstance(layer, WeightNormWrapper):
                layer.remove_weight_norm()

        # Run model inference.
        wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
        return wav.numpy()[0]

    @serving
    def serving_method(self,
                       texts,
                       use_gpu=False,
                       speed=1.0,
                       vocoder="griffin-lim"):
        """
        Run as a service.

        Returns a dict with "wavs" (each waveform converted to a plain list)
        and "sample_rate".
        """
        wavs, sample_rate = self.synthesize(texts, use_gpu, speed, vocoder)
        wavs = [wav.tolist() for wav in wavs]
        result = {"wavs": wavs, "sample_rate": sample_rate}
        return result

    def add_module_config_arg(self):
        """
        Add the command config options
        """
        self.arg_config_group.add_argument(
            '--use_gpu',
            type=ast.literal_eval,
            default=False,
            help="whether use GPU for prediction")

        self.arg_config_group.add_argument('--vocoder',
                                           type=str,
                                           default="griffin-lim",
                                           choices=['griffin-lim', 'waveflow'],
                                           help="the vocoder name")

    def add_module_output_arg(self):
        """
        Add the command output options
        """
        self.arg_output_group.add_argument(
            '--output_path',
            type=str,
            default=os.path.abspath(
                os.path.join(os.path.curdir, f"{self.name}_prediction")),
            help="path to save experiment results")

    @runnable
    def run_cmd(self, argvs):
        """
        Run as a command

        Parses ``argvs``, synthesizes the input texts and writes one
        ``<index>.wav`` file per text into the output path. Returns a status
        message, or None (after printing the help) when the input data
        cannot be read.
        """
        self.parser = argparse.ArgumentParser(
            description='Run the %s module.' % self.name,
            prog='hub run %s' % self.name,
            usage='%(prog)s',
            add_help=True)

        self.arg_input_group = self.parser.add_argument_group(
            title="Input options", description="Input data. Required")
        # This group used to be assigned to arg_input_group as well, which
        # discarded the input group created just above.
        self.arg_output_group = self.parser.add_argument_group(
            title="Ouput options", description="Ouput path. Optional.")
        self.arg_config_group = self.parser.add_argument_group(
            title="Config options",
            description=
            "Run configuration for controlling module behavior, optional.")

        self.add_module_config_arg()
        self.add_module_input_arg()
        self.add_module_output_arg()

        args = self.parser.parse_args(argvs)

        try:
            input_data = self.check_input_data(args)
        except (DataFormatError, RuntimeError):
            # `except A and B` only catches B; a tuple catches both.
            self.parser.print_help()
            return None

        mkdir(args.output_path)
        wavs, sample_rate = self.synthesize(texts=input_data,
                                            use_gpu=args.use_gpu,
                                            vocoder=args.vocoder)

        for index, wav in enumerate(wavs):
            sf.write(os.path.join(args.output_path, f"{index}.wav"), wav,
                     sample_rate)

        ret = f"The synthesized wav files have been saved in {args.output_path}"
        return ret
Beispiel #6
0
def main(args):
    """Train FastSpeech using a TransformerTTS model to supply alignments.

    For every batch the loaded TransformerTTS model (in eval mode) produces
    attention probabilities; ``get_alignment`` turns them into the alignment
    that FastSpeech receives as ``length_target`` and that the duration loss
    is computed against. Rank 0 logs to a summary writer and saves
    model/optimizer state every ``args.save_step`` steps.

    Args:
        args: Parsed options. This function reads ``use_data_parallel``,
            ``use_gpu``, ``config_path``, ``log_dir``, ``transformer_step``,
            ``transtts_path``, ``lr``, ``checkpoint_path``,
            ``fastspeech_step``, ``epochs``, ``save_step`` and ``save_path``
            (``args`` itself is also passed to ``LJSpeechLoader``).
    """
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    # Every rank executes this line. An exists()/mkdir() pair can fail with
    # FileExistsError when two ranks race; exist_ok=True makes it safe.
    os.makedirs(args.log_dir, exist_ok=True)
    path = os.path.join(args.log_dir, 'fastspeech')

    # Only rank 0 writes summaries.
    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        with fluid.unique_name.guard():
            transformer_tts = TransformerTTS(cfg)
            model_dict, _ = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.transtts_path, "transformer"))
            transformer_tts.set_dict(model_dict)
            transformer_tts.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())
        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.fastspeech_step
            print("load checkpoint!!!")

        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        for epoch in range(args.epochs):
            pbar = tqdm(reader)

            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                (character, mel, mel_input, pos_text, pos_mel, text_length,
                 mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask,
                 enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data

                # Only the attention probabilities of TransformerTTS are
                # used; they are converted into the alignment target.
                _, _, attn_probs, _, _, _ = transformer_tts(
                    character,
                    mel_input,
                    pos_text,
                    pos_mel,
                    dec_slf_mask=dec_slf_mask,
                    enc_slf_mask=enc_slf_mask,
                    enc_query_mask=enc_query_mask,
                    enc_dec_mask=enc_dec_mask,
                    dec_query_slf_mask=dec_query_slf_mask,
                    dec_query_mask=dec_query_mask)
                alignment, max_attn = get_alignment(attn_probs, mel_lens,
                                                    cfg['transformer_head'])
                alignment = dg.to_variable(alignment).astype(np.float32)

                if local_rank == 0 and global_step % 5 == 1:
                    # NOTE(review): the hard-coded index 8 presumably picks
                    # one sample of the batch, which would require at least
                    # nine samples per batch - confirm against the loader.
                    x = np.uint8(
                        cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255)
                    writer.add_image(
                        'Attention_%d_0' % global_step,
                        x,
                        0,
                        dataformats="HWC")

                global_step += 1

                # Forward
                result = model(
                    character,
                    pos_text,
                    mel_pos=pos_mel,
                    length_target=alignment,
                    enc_non_pad_mask=enc_query_mask,
                    enc_slf_attn_mask=enc_slf_mask,
                    dec_non_pad_mask=dec_query_slf_mask,
                    dec_slf_attn_mask=dec_slf_mask)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                # Mean absolute error between predicted and target durations.
                duration_loss = layers.mean(
                    layers.abs(
                        layers.elementwise_sub(duration_predictor_output,
                                               alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank == 0:
                    writer.add_scalar('mel_loss',
                                      mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss',
                                      mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss',
                                      duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                if args.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(
                    total_loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                        'grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    # Creates missing parent directories too and tolerates
                    # an already existing directory.
                    os.makedirs(args.save_path, exist_ok=True)
                    save_path = os.path.join(args.save_path,
                                             'fastspeech/%d' % global_step)
                    # Model and optimizer state are saved under the same
                    # path prefix.
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
        if local_rank == 0:
            writer.close()
Beispiel #7
0
def synthesis(text_input, args):
    """Synthesize ``text_input`` with FastSpeech and log the audio.

    The FastSpeech checkpoint is restored, a mel spectrogram is predicted
    for the text and inverted to a waveform by an ``audio.AudioProcessor``.
    The waveform is added to a summary writer under
    ``<args.log_dir>/synthesis``.

    Args:
        text_input: Text to synthesize; also used as the audio tag.
        args: Parsed options. This function reads ``use_gpu``, ``log_dir``,
            ``config_path``, ``fastspeech_step``, ``checkpoint_path`` and
            ``alpha``.
    """
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    # tensorboard
    # makedirs(exist_ok=True) creates missing parent directories too and
    # tolerates a directory that already exists.
    os.makedirs(args.log_dir, exist_ok=True)
    path = os.path.join(args.log_dir, 'synthesis')

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    writer = SummaryWriter(path)
    try:
        with dg.guard(place):
            model = FastSpeech(cfg)
            # NOTE(review): whatever load_checkpoint returns is handed to
            # set_dict unchanged - confirm it returns only the model state.
            model.set_dict(
                load_checkpoint(
                    str(args.fastspeech_step),
                    os.path.join(args.checkpoint_path, "fastspeech")))
            model.eval()

            # Add a leading axis of size 1 to the sequence and build
            # 1-based positions for it.
            text = np.asarray(text_to_sequence(text_input))
            text = np.expand_dims(text, axis=0)
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = np.expand_dims(pos_text, axis=0)
            enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32)
            enc_slf_attn_mask = get_attn_key_pad_mask(pos_text,
                                                      text).astype(np.float32)

            text = dg.to_variable(text)
            pos_text = dg.to_variable(pos_text)
            enc_non_pad_mask = dg.to_variable(enc_non_pad_mask)
            enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask)

            # Only the postnet output is used below.
            _, mel_output_postnet = model(
                text,
                pos_text,
                alpha=args.alpha,
                enc_non_pad_mask=enc_non_pad_mask,
                enc_slf_attn_mask=enc_slf_attn_mask,
                dec_non_pad_mask=None,
                dec_slf_attn_mask=None)

            _ljspeech_processor = audio.AudioProcessor(
                sample_rate=cfg['audio']['sr'],
                num_mels=cfg['audio']['num_mels'],
                min_level_db=cfg['audio']['min_level_db'],
                ref_level_db=cfg['audio']['ref_level_db'],
                n_fft=cfg['audio']['n_fft'],
                win_length=cfg['audio']['win_length'],
                hop_length=cfg['audio']['hop_length'],
                power=cfg['audio']['power'],
                preemphasis=cfg['audio']['preemphasis'],
                signal_norm=True,
                symmetric_norm=False,
                max_norm=1.,
                mel_fmin=0,
                mel_fmax=None,
                clip_norm=True,
                griffin_lim_iters=60,
                do_trim_silence=False,
                sound_norm=False)

            # Drop axis 0 and swap the remaining two axes before inverting
            # the mel spectrogram.
            mel_output_postnet = fluid.layers.transpose(
                fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
            wav = _ljspeech_processor.inv_melspectrogram(
                mel_output_postnet.numpy())
            writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
            print("Synthesis completed !!!")
    finally:
        # Close the event file even when synthesis fails part-way.
        writer.close()