Example 1
    def _initialize(self):
        """
        Initialize with the necessary elements.
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-162000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)
        with fluid.dygraph.guard(fluid.CPUPlace()):
            self.tts_model = FastSpeechModel(
                self.tts_config['network'],
                num_mels=self.tts_config['audio']['num_mels'])
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)
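
The AttrDict helper used to build args above is not defined in the snippet. A minimal sketch of the usual attribute-style dict it implies (an assumption, not the example's actual implementation):

class AttrDict(dict):
    """Hypothetical helper: a dict whose keys can also be read and written as attributes."""

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as e:
            raise AttributeError(key) from e

    def __setattr__(self, key, value):
        self[key] = value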
Example 2
    def build(self, training=True):
        """Initialize the model.

        Args:
            training (bool, optional): Whether the model is built for training or inference.
                Defaults to True.

        Returns:
            int: The iteration of the loaded checkpoint.
        """
        config = self.config
        dataset = LJSpeech(config, self.nranks, self.rank)
        self.trainloader = dataset.trainloader
        self.validloader = dataset.validloader

        waveflow = WaveFlowModule(config)

        # Dry run once to create and initialize all necessary parameters.
        audio = dg.to_variable(np.random.randn(1, 16000).astype(self.dtype))
        mel = dg.to_variable(
            np.random.randn(1, config.mel_bands, 63).astype(self.dtype))
        waveflow(audio, mel)

        if training:
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=config.learning_rate,
                parameter_list=waveflow.parameters())

            # Load parameters.
            iteration = io.load_parameters(model=waveflow,
                                           optimizer=optimizer,
                                           checkpoint_dir=self.checkpoint_dir,
                                           iteration=config.iteration,
                                           checkpoint_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            # Data parallelism.
            if self.parallel:
                strategy = dg.parallel.prepare_context()
                waveflow = dg.parallel.DataParallel(waveflow, strategy)

            self.waveflow = waveflow
            self.optimizer = optimizer
            self.criterion = WaveFlowLoss(config.sigma)

        else:
            # Load parameters.
            iteration = io.load_parameters(model=waveflow,
                                           checkpoint_dir=self.checkpoint_dir,
                                           iteration=config.iteration,
                                           checkpoint_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            for layer in waveflow.sublayers():
                if isinstance(layer, weight_norm.WeightNormWrapper):
                    layer.remove_weight_norm()

            self.waveflow = waveflow

        return iteration
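
A minimal usage sketch for build(); the wrapper class name and constructor arguments below are assumptions, not taken from the source:

# Hypothetical driver: build for training inside a dygraph context,
# then resume from the returned checkpoint iteration.
with fluid.dygraph.guard(fluid.CPUPlace()):
    model = WaveFlow(config, checkpoint_dir, parallel=False, rank=0, nranks=1)
    iteration = model.build(training=True)
    print("Resuming training from iteration {}".format(iteration))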
Example 3
    def __init__(self, config_path, checkpoint_path):
        with open(config_path, 'rt') as f:
            config = ruamel.yaml.safe_load(f)
        ns = argparse.Namespace()
        for k, v in config.items():
            setattr(ns, k, v)
        ns.use_fp16 = False

        self.model = WaveFlowModule(ns)
        io.load_parameters(self.model, checkpoint_path=checkpoint_path)
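
The per-key setattr loop can be collapsed with Namespace's keyword constructor; a behavior-equivalent sketch using only standard argparse:

# Equivalent to the loop above, assuming top-level YAML keys are strings.
ns = argparse.Namespace(**config)
ns.use_fp16 = False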
Example 4
    def _initialize(self):
        """
        Initialize with the necessary elements.
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-120000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)

        # The max length of audio for synthesis.
        self.max_len = 1000
        # The stop-token threshold that decides whether generation of spectrum frames should stop at a time step.
        self.stop_threshold = 0.5

        with fluid.dygraph.guard(fluid.CPUPlace()):
            # Build TTS.
            with fluid.unique_name.guard():
                network_cfg = self.tts_config['network']
                self.tts_model = TransformerTTSModel(
                    network_cfg['embedding_size'], network_cfg['hidden_size'],
                    network_cfg['encoder_num_head'],
                    network_cfg['encoder_n_layers'],
                    self.tts_config['audio']['num_mels'],
                    network_cfg['outputs_per_step'],
                    network_cfg['decoder_num_head'],
                    network_cfg['decoder_n_layers'])
                io.load_parameters(model=self.tts_model,
                                   checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)
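
For reference, the 'network' section this example reads from ljspeech.yaml must provide at least the keys below; the values are illustrative placeholders, not taken from the source:

# Hypothetical shape of tts_config['network'] as consumed above.
network_cfg = {
    'embedding_size': 512,    # placeholder
    'hidden_size': 256,       # placeholder
    'encoder_num_head': 4,    # placeholder
    'encoder_n_layers': 3,    # placeholder
    'outputs_per_step': 1,    # placeholder
    'decoder_num_head': 4,    # placeholder
    'decoder_n_layers': 3,    # placeholder
}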
Example 5
class WaveflowVocoder(object):
    def __init__(self, config_path, checkpoint_path):
        with open(config_path, 'rt') as f:
            config = ruamel.yaml.safe_load(f)
        ns = argparse.Namespace()
        for k, v in config.items():
            setattr(ns, k, v)
        ns.use_fp16 = False

        self.model = WaveFlowModule(ns)
        io.load_parameters(self.model, checkpoint_path=checkpoint_path)

    def __call__(self, mel):
        with dg.no_grad():
            self.model.eval()
            audio = self.model.synthesize(mel)
        self.model.train()
        return audio
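
A usage sketch for the vocoder; the paths, the 80 mel bands, and the frame count are placeholders (the mel layout [batch, mel_bands, frames] matches Example 2's dry run):

# Illustrative only; all paths and shapes below are assumptions.
with fluid.dygraph.guard(fluid.CPUPlace()):
    vocoder = WaveflowVocoder("waveflow_ljspeech.yaml", "checkpoints/step-2000000")
    mel = dg.to_variable(np.random.randn(1, 80, 63).astype("float32"))
    audio = vocoder(mel)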
Example 6
def synthesis_with_waveflow(mel_output, args, checkpoint, place):

    fluid.enable_dygraph(place)
    args.config = args.config_vocoder
    args.use_fp16 = False
    config = io.add_yaml_config_to_args(args)

    mel_spectrogram = fluid.layers.transpose(mel_output, [0, 2, 1])

    # Build model.
    waveflow = WaveFlowModule(config)
    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
    for layer in waveflow.sublayers():
        if isinstance(layer, weight_norm.WeightNormWrapper):
            layer.remove_weight_norm()

    # Run model inference.
    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
    return wav.numpy()[0]
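
A call sketch: mel_output is expected channels-last ([batch, frames, mel_bands], transposed inside the function), and the checkpoint path and sample rate below are assumptions:

import soundfile as sf

wav = synthesis_with_waveflow(mel_output, args, "checkpoints/step-2000000",
                              fluid.CPUPlace())
sf.write("synth.wav", wav, samplerate=22050)  # LJSpeech's usual rate, assumed here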
Example 7
class FastSpeech(hub.NLPPredictionModule):
    def _initialize(self):
        """
        Initialize with the necessary elements.
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-162000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)
        with fluid.dygraph.guard(fluid.CPUPlace()):
            self.tts_model = FastSpeechModel(
                self.tts_config['network'],
                num_mels=self.tts_config['audio']['num_mels'])
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)

    def synthesize(self,
                   texts,
                   use_gpu=False,
                   speed=1.0,
                   vocoder="griffin-lim"):
        """
        Get the synthetic wavs from the texts.

        Args:
             texts(list): the input texts to be predicted.
             use_gpu(bool): whether use gpu to predict or not. Default False.
             speed(float): Controlling the voice speed. Default 1.0.
             vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

        Returns:
             wavs(str): the audio wav with sample rate . You can use soundfile.write to save it.
             sample_rate(int): the audio sample rate.
        """
        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES was not set while use_gpu=True"
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        if texts and isinstance(texts, list):
            predicted_data = texts
        else:
            raise ValueError(
                "The input texts must be a non-empty list of strings.")

        wavs = []
        with fluid.dygraph.guard(place):
            self.tts_model.eval()
            self.waveflow.eval()
            for text in predicted_data:
                # init input
                logger.info("Processing sentence: %s" % text)
                text = np.asarray(text_to_sequence(text))
                text = np.expand_dims(text, axis=0)
                pos_text = np.arange(1, text.shape[1] + 1)
                pos_text = np.expand_dims(pos_text, axis=0)

                text = dg.to_variable(text).astype(np.int64)
                pos_text = dg.to_variable(pos_text).astype(np.int64)

                _, mel_output_postnet = self.tts_model(text,
                                                       pos_text,
                                                       alpha=1 / speed)

                if vocoder == 'griffin-lim':
                    # synthesis use griffin-lim
                    wav = self.synthesis_with_griffinlim(
                        mel_output_postnet, self.tts_config['audio'])
                elif vocoder == 'waveflow':
                    wav = self.synthesis_with_waveflow(
                        mel_output_postnet, self.waveflow_config.sigma)
                else:
                    raise ValueError(
                        'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                        % vocoder)
                wavs.append(wav)
        return wavs, self.tts_config['audio']['sr']

    def synthesis_with_griffinlim(self, mel_output, cfg):
        # synthesis with griffin-lim
        mel_output = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_output = np.exp(mel_output.numpy())
        basis = librosa.filters.mel(sr=cfg['sr'],
                                    n_fft=cfg['n_fft'],
                                    n_mels=cfg['num_mels'],
                                    fmin=cfg['fmin'],
                                    fmax=cfg['fmax'])
        inv_basis = np.linalg.pinv(basis)
        spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))

        wav = librosa.core.griffinlim(spec**cfg['power'],
                                      hop_length=cfg['hop_length'],
                                      win_length=cfg['win_length'])

        return wav

    def synthesis_with_waveflow(self, mel_output, sigma):
        mel_spectrogram = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])

        for layer in self.waveflow.sublayers():
            if isinstance(layer, WeightNormWrapper):
                layer.remove_weight_norm()

        # Run model inference.
        wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
        return wav.numpy()[0]

    @serving
    def serving_method(self,
                       texts,
                       use_gpu=False,
                       speed=1.0,
                       vocoder="griffin-lim"):
        """
        Run as a service.
        """
        wavs, sample_rate = self.synthesize(texts, use_gpu, speed, vocoder)
        wavs = [wav.tolist() for wav in wavs]
        result = {"wavs": wavs, "sample_rate": sample_rate}
        return result

    def add_module_config_arg(self):
        """
        Add the command config options
        """
        self.arg_config_group.add_argument(
            '--use_gpu',
            type=ast.literal_eval,
            default=False,
            help="whether use GPU for prediction")

        self.arg_config_group.add_argument('--vocoder',
                                           type=str,
                                           default="griffin-lim",
                                           choices=['griffin-lim', 'waveflow'],
                                           help="the vocoder name")

    def add_module_output_arg(self):
        """
        Add the command output options
        """
        self.arg_config_group.add_argument(
            '--output_path',
            type=str,
            default=os.path.abspath(
                os.path.join(os.path.curdir, f"{self.name}_prediction")),
            help="path to save experiment results")

    @runnable
    def run_cmd(self, argvs):
        """
        Run as a command
        """
        self.parser = argparse.ArgumentParser(
            description='Run the %s module.' % self.name,
            prog='hub run %s' % self.name,
            usage='%(prog)s',
            add_help=True)

        self.arg_input_group = self.parser.add_argument_group(
            title="Input options", description="Input data. Required.")
        self.arg_output_group = self.parser.add_argument_group(
            title="Output options", description="Output path. Optional.")
        self.arg_config_group = self.parser.add_argument_group(
            title="Config options",
            description=
            "Run configuration for controlling module behavior, optional.")

        self.add_module_config_arg()
        self.add_module_input_arg()
        self.add_module_output_arg()

        args = self.parser.parse_args(argvs)

        try:
            input_data = self.check_input_data(args)
        except (DataFormatError, RuntimeError):
            self.parser.print_help()
            return None

        mkdir(args.output_path)
        wavs, sample_rate = self.synthesize(texts=input_data,
                                            use_gpu=args.use_gpu,
                                            vocoder=args.vocoder)

        for index, wav in enumerate(wavs):
            sf.write(os.path.join(args.output_path, f"{index}.wav"), wav,
                     sample_rate)

        ret = f"The synthesized wav files have been saved in {args.output_path}"
        return ret
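
A usage sketch for the module as a whole; the hub name "fastspeech_ljspeech" is a guess, not taken from the source:

import paddlehub as hub
import soundfile as sf

module = hub.Module(name="fastspeech_ljspeech")  # hypothetical module name
wavs, sample_rate = module.synthesize(texts=["Hello, world."],
                                      use_gpu=False,
                                      vocoder="waveflow")
for index, wav in enumerate(wavs):
    sf.write(f"{index}.wav", wav, sample_rate)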