def build(self, training=True):
    """Initialize the model.

    Args:
        training (bool, optional): Whether the model is built for training
            or inference. Defaults to True.

    Returns:
        iteration (int): the iteration restored from the checkpoint.
    """
    config = self.config
    dataset = LJSpeech(config, self.nranks, self.rank)
    self.trainloader = dataset.trainloader
    self.validloader = dataset.validloader

    waveflow = WaveFlowModule(config)

    # Dry run once to create and initialize all necessary parameters.
    audio = dg.to_variable(np.random.randn(1, 16000).astype(self.dtype))
    mel = dg.to_variable(
        np.random.randn(1, config.mel_bands, 63).astype(self.dtype))
    waveflow(audio, mel)

    if training:
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=config.learning_rate,
            parameter_list=waveflow.parameters())

        # Load parameters.
        iteration = io.load_parameters(model=waveflow,
                                       optimizer=optimizer,
                                       checkpoint_dir=self.checkpoint_dir,
                                       iteration=config.iteration,
                                       checkpoint_path=config.checkpoint)
        print("Rank {}: checkpoint loaded.".format(self.rank))

        # Data parallelism.
        if self.parallel:
            strategy = dg.parallel.prepare_context()
            waveflow = dg.parallel.DataParallel(waveflow, strategy)

        self.waveflow = waveflow
        self.optimizer = optimizer
        self.criterion = WaveFlowLoss(config.sigma)
    else:
        # Load parameters.
        iteration = io.load_parameters(model=waveflow,
                                       checkpoint_dir=self.checkpoint_dir,
                                       iteration=config.iteration,
                                       checkpoint_path=config.checkpoint)
        print("Rank {}: checkpoint loaded.".format(self.rank))

        # Remove weight normalization for inference.
        for layer in waveflow.sublayers():
            if isinstance(layer, weight_norm.WeightNormWrapper):
                layer.remove_weight_norm()

        self.waveflow = waveflow

    return iteration
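# A hedged sketch of how build() is typically driven; the enclosing class's
# constructor and the `max_iterations` field are assumptions, not taken from
# this snippet.
#
#   model = WaveFlow(config, checkpoint_dir)   # hypothetical constructor
#   iteration = model.build(training=True)     # dry run + checkpoint restore
#   while iteration < config.max_iterations:
#       iteration += 1
#       ...  # fetch a batch from model.trainloader and run one train step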
def _initialize(self):
    """
    Initialize with the necessary elements.
    """
    self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                            "step-120000")
    self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                 "vocoder", "step-2000000")
    self.waveflow_config_path = os.path.join(self.directory, "assets",
                                             "vocoder",
                                             "waveflow_ljspeech.yaml")
    tts_config_path = os.path.join(self.directory, "assets", "tts",
                                   "ljspeech.yaml")
    with open(tts_config_path) as f:
        self.tts_config = yaml.load(f, Loader=yaml.Loader)

    # The maximum length of audio at synthesis time.
    self.max_len = 1000
    # The stop-token threshold that decides whether spectrum generation
    # should stop at a given time step.
    self.stop_threshold = 0.5

    with fluid.dygraph.guard(fluid.CPUPlace()):
        # Build TTS.
        with fluid.unique_name.guard():
            network_cfg = self.tts_config['network']
            self.tts_model = TransformerTTSModel(
                network_cfg['embedding_size'], network_cfg['hidden_size'],
                network_cfg['encoder_num_head'],
                network_cfg['encoder_n_layers'],
                self.tts_config['audio']['num_mels'],
                network_cfg['outputs_per_step'],
                network_cfg['decoder_num_head'],
                network_cfg['decoder_n_layers'])
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)

        # Build vocoder.
        args = AttrDict()
        args.config = self.waveflow_config_path
        args.use_fp16 = False
        self.waveflow_config = io.add_yaml_config_to_args(args)
        self.waveflow = WaveFlowModule(self.waveflow_config)
        io.load_parameters(model=self.waveflow,
                           checkpoint_path=self.waveflow_checkpoint_path)
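# A hypothetical illustration (not part of this module) of how `max_len` and
# `stop_threshold` above are typically consumed by an autoregressive TTS
# decoder: generation halts when the predicted stop-token probability crosses
# the threshold, or when the frame budget is exhausted. `decode_step` is a
# stand-in callable, not a real API of this codebase.
def _autoregressive_decode_sketch(decode_step, max_len=1000,
                                  stop_threshold=0.5):
    frames = []
    for _ in range(max_len):
        # One decoder step: returns the next mel frame and a stop probability.
        frame, stop_prob = decode_step(frames)
        frames.append(frame)
        if stop_prob > stop_threshold:
            break
    return frames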
class WaveflowVocoder(object):
    def __init__(self, config_path, checkpoint_path):
        with open(config_path, 'rt') as f:
            config = ruamel.yaml.safe_load(f)
        ns = argparse.Namespace()
        for k, v in config.items():
            setattr(ns, k, v)
        ns.use_fp16 = False

        self.model = WaveFlowModule(ns)
        io.load_parameters(self.model, checkpoint_path=checkpoint_path)

    def __call__(self, mel):
        with dg.no_grad():
            self.model.eval()
            audio = self.model.synthesize(mel)
        self.model.train()
        return audio
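# A minimal usage sketch for the WaveflowVocoder class above. The YAML and
# checkpoint paths are placeholders, and the random tensor only demonstrates
# the expected (batch, mel_bands, frames) input layout (80 mel bands is
# assumed from the LJSpeech configs); a real call would pass a mel
# spectrogram predicted by a TTS frontend.
def _demo_waveflow_vocoder():
    import numpy as np
    import paddle.fluid as fluid
    import paddle.fluid.dygraph as dg

    fluid.enable_dygraph(fluid.CPUPlace())
    vocoder = WaveflowVocoder("waveflow_ljspeech.yaml", "step-2000000")
    mel = dg.to_variable(np.random.randn(1, 80, 63).astype("float32"))
    audio = vocoder(mel)
    print("synthesized audio shape:", audio.numpy().shape)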
def synthesis_with_waveflow(mel_output, args, checkpoint, place):
    fluid.enable_dygraph(place)
    args.config = args.config_vocoder
    args.use_fp16 = False
    config = io.add_yaml_config_to_args(args)

    mel_spectrogram = fluid.layers.transpose(mel_output, [0, 2, 1])

    # Build model.
    waveflow = WaveFlowModule(config)
    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
    for layer in waveflow.sublayers():
        if isinstance(layer, weight_norm.WeightNormWrapper):
            layer.remove_weight_norm()

    # Run model inference.
    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
    return wav.numpy()[0]
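# A hedged usage sketch for synthesis_with_waveflow above. The namespace only
# pre-sets the attribute the function reads (config_vocoder);
# io.add_yaml_config_to_args is expected to fill in the rest from the YAML
# file. Paths are placeholders, and the random tensor stands in for a
# (batch, frames, mel_bands) mel from a TTS model, since the function
# transposes it to (batch, mel_bands, frames) before synthesis.
def _demo_synthesis_with_waveflow():
    import argparse
    import numpy as np
    import soundfile as sf
    import paddle.fluid as fluid
    import paddle.fluid.dygraph as dg

    place = fluid.CPUPlace()
    fluid.enable_dygraph(place)
    args = argparse.Namespace(config_vocoder="waveflow_ljspeech.yaml")
    mel = dg.to_variable(np.random.randn(1, 63, 80).astype("float32"))
    wav = synthesis_with_waveflow(mel, args, "step-2000000", place)
    sf.write("sample.wav", wav, 22050)  # 22050 Hz assumed from LJSpeech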
class FastSpeech(hub.NLPPredictionModule):
    def _initialize(self):
        """
        Initialize with the necessary elements.
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-162000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder",
                                                     "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")
        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)
        with fluid.dygraph.guard(fluid.CPUPlace()):
            self.tts_model = FastSpeechModel(
                self.tts_config['network'],
                num_mels=self.tts_config['audio']['num_mels'])
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)

    def synthesize(self, texts, use_gpu=False, speed=1.0,
                   vocoder="griffin-lim"):
        """
        Get the synthesized wavs from the texts.

        Args:
            texts (list): the input texts to be predicted.
            use_gpu (bool): whether to use GPU for prediction. Default False.
            speed (float): controls the speed of the synthesized voice.
                Default 1.0.
            vocoder (str): the vocoder name, "griffin-lim" or "waveflow".

        Returns:
            wavs (list): the synthesized audio waveforms. You can use
                soundfile.write to save them.
            sample_rate (int): the audio sample rate.
        """
        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set to False: use_gpu=True requires the "
                "environment variable CUDA_VISIBLE_DEVICES to be set.")

        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        if texts and isinstance(texts, list):
            predicted_data = texts
        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        wavs = []
        with fluid.dygraph.guard(place):
            self.tts_model.eval()
            self.waveflow.eval()
            for text in predicted_data:
                # Initialize input.
                logger.info("Processing sentence: %s" % text)
                text = np.asarray(text_to_sequence(text))
                text = np.expand_dims(text, axis=0)
                pos_text = np.arange(1, text.shape[1] + 1)
                pos_text = np.expand_dims(pos_text, axis=0)

                text = dg.to_variable(text).astype(np.int64)
                pos_text = dg.to_variable(pos_text).astype(np.int64)

                _, mel_output_postnet = self.tts_model(text, pos_text,
                                                       alpha=1 / speed)

                if vocoder == 'griffin-lim':
                    # Synthesize with Griffin-Lim.
                    wav = self.synthesis_with_griffinlim(
                        mel_output_postnet, self.tts_config['audio'])
                elif vocoder == 'waveflow':
                    wav = self.synthesis_with_waveflow(
                        mel_output_postnet, self.waveflow_config.sigma)
                else:
                    raise ValueError(
                        'vocoder error: only "griffin-lim" and "waveflow" are supported, but received %s.'
                        % vocoder)
                wavs.append(wav)
        return wavs, self.tts_config['audio']['sr']

    def synthesis_with_griffinlim(self, mel_output, cfg):
        # Synthesize with Griffin-Lim.
        mel_output = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_output = np.exp(mel_output.numpy())
        basis = librosa.filters.mel(cfg['sr'],
                                    cfg['n_fft'],
                                    cfg['num_mels'],
                                    fmin=cfg['fmin'],
                                    fmax=cfg['fmax'])
        inv_basis = np.linalg.pinv(basis)
        spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))

        wav = librosa.core.griffinlim(spec**cfg['power'],
                                      hop_length=cfg['hop_length'],
                                      win_length=cfg['win_length'])
        return wav

    def synthesis_with_waveflow(self, mel_output, sigma):
        mel_spectrogram = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output, [0]), [1, 0])
        mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])

        for layer in self.waveflow.sublayers():
            if isinstance(layer, WeightNormWrapper):
                layer.remove_weight_norm()

        # Run model inference.
        wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
        return wav.numpy()[0]

    @serving
    def serving_method(self, texts, use_gpu=False, speed=1.0,
                       vocoder="griffin-lim"):
        """
        Run as a service.
        """
        wavs, sample_rate = self.synthesize(texts, use_gpu, speed, vocoder)
        wavs = [wav.tolist() for wav in wavs]
        result = {"wavs": wavs, "sample_rate": sample_rate}
        return result

    def add_module_config_arg(self):
        """
        Add the command config options.
        """
        self.arg_config_group.add_argument(
            '--use_gpu',
            type=ast.literal_eval,
            default=False,
            help="whether to use GPU for prediction")

        self.arg_config_group.add_argument(
            '--vocoder',
            type=str,
            default="griffin-lim",
            choices=['griffin-lim', 'waveflow'],
            help="the vocoder name")

    def add_module_output_arg(self):
        """
        Add the command output options.
        """
        self.arg_config_group.add_argument(
            '--output_path',
            type=str,
            default=os.path.abspath(
                os.path.join(os.path.curdir, f"{self.name}_prediction")),
            help="path to save experiment results")

    @runnable
    def run_cmd(self, argvs):
        """
        Run as a command.
        """
        self.parser = argparse.ArgumentParser(
            description='Run the %s module.' % self.name,
            prog='hub run %s' % self.name,
            usage='%(prog)s',
            add_help=True)

        self.arg_input_group = self.parser.add_argument_group(
            title="Input options", description="Input data. Required.")
        self.arg_input_group = self.parser.add_argument_group(
            title="Output options", description="Output path. Optional.")
        self.arg_config_group = self.parser.add_argument_group(
            title="Config options",
            description=
            "Run configuration for controlling module behavior, optional.")

        self.add_module_config_arg()
        self.add_module_input_arg()
        self.add_module_output_arg()

        args = self.parser.parse_args(argvs)

        try:
            input_data = self.check_input_data(args)
        except (DataFormatError, RuntimeError):
            self.parser.print_help()
            return None

        mkdir(args.output_path)
        wavs, sample_rate = self.synthesize(texts=input_data,
                                            use_gpu=args.use_gpu,
                                            vocoder=args.vocoder)

        for index, wav in enumerate(wavs):
            sf.write(os.path.join(args.output_path, f"{index}.wav"), wav,
                     sample_rate)

        ret = f"The synthesized wav files have been saved in {args.output_path}"
        return ret
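# A hedged usage sketch for the hub module above. The published module name
# ("fastspeech_ljspeech") is an assumption; check the actual name on
# PaddleHub before running.
#
#   import paddlehub as hub
#   import soundfile as sf
#
#   module = hub.Module(name="fastspeech_ljspeech")
#   wavs, sample_rate = module.synthesize(["Hello, world."],
#                                         vocoder="waveflow")
#   sf.write("output.wav", wavs[0], sample_rate)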