def _initialize(self):
    """Load the FastSpeech acoustic model and the WaveFlow vocoder from the
    module's bundled assets, on CPU, ready for synthesis."""
    assets_dir = os.path.join(self.directory, "assets")
    self.tts_checkpoint_path = os.path.join(assets_dir, "tts", "step-162000")
    self.waveflow_checkpoint_path = os.path.join(assets_dir, "vocoder",
                                                 "step-2000000")
    self.waveflow_config_path = os.path.join(assets_dir, "vocoder",
                                             "waveflow_ljspeech.yaml")

    # Parse the acoustic-model configuration shipped with the module.
    # NOTE: yaml.Loader can construct arbitrary Python objects; acceptable
    # here because the file is a bundled asset, not untrusted input.
    tts_config_path = os.path.join(assets_dir, "tts", "ljspeech.yaml")
    with open(tts_config_path) as config_file:
        self.tts_config = yaml.load(config_file, Loader=yaml.Loader)

    with fluid.dygraph.guard(fluid.CPUPlace()):
        # Acoustic model: text -> mel spectrogram.
        self.tts_model = FastSpeechModel(
            self.tts_config['network'],
            num_mels=self.tts_config['audio']['num_mels'])
        io.load_parameters(
            model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)

        # Build vocoder: mel spectrogram -> waveform.
        vocoder_args = AttrDict()
        vocoder_args.config = self.waveflow_config_path
        vocoder_args.use_fp16 = False
        self.waveflow_config = io.add_yaml_config_to_args(vocoder_args)
        self.waveflow = WaveFlowModule(self.waveflow_config)
        io.load_parameters(
            model=self.waveflow,
            checkpoint_path=self.waveflow_checkpoint_path)
def build(self, training=True):
    """Build the WaveFlow model, data loaders and (for training) the optimizer.

    A dry forward pass is run first so all parameters are created and
    initialized before the checkpoint is restored into them.

    Args:
        training (bool, optional): Whether the model is built for training
            or inference. Defaults to True.

    Returns:
        The iteration value returned by ``io.load_parameters`` for the
        restored checkpoint (the original docstring said ``None``, but the
        code returns ``iteration`` on both paths).
    """
    config = self.config
    dataset = LJSpeech(config, self.nranks, self.rank)
    self.trainloader = dataset.trainloader
    self.validloader = dataset.validloader

    waveflow = WaveFlowModule(config)

    # Dry run once to create and initialize all necessary parameters.
    audio = dg.to_variable(np.random.randn(1, 16000).astype(self.dtype))
    mel = dg.to_variable(
        np.random.randn(1, config.mel_bands, 63).astype(self.dtype))
    waveflow(audio, mel)

    if training:
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=config.learning_rate,
            parameter_list=waveflow.parameters())

        # Load parameters (model + optimizer state).
        iteration = io.load_parameters(
            model=waveflow,
            optimizer=optimizer,
            checkpoint_dir=self.checkpoint_dir,
            iteration=config.iteration,
            checkpoint_path=config.checkpoint)
        print("Rank {}: checkpoint loaded.".format(self.rank))

        # Data parallelism: wrap the model only after its parameters exist
        # and the checkpoint has been restored.
        if self.parallel:
            strategy = dg.parallel.prepare_context()
            waveflow = dg.parallel.DataParallel(waveflow, strategy)

        self.waveflow = waveflow
        self.optimizer = optimizer
        self.criterion = WaveFlowLoss(config.sigma)
    else:
        # Load parameters (model only; no optimizer state for inference).
        iteration = io.load_parameters(
            model=waveflow,
            checkpoint_dir=self.checkpoint_dir,
            iteration=config.iteration,
            checkpoint_path=config.checkpoint)
        print("Rank {}: checkpoint loaded.".format(self.rank))

        # Fold weight normalization into plain weights for inference; must
        # happen after the checkpoint (which stores normalized form) loads.
        for layer in waveflow.sublayers():
            if isinstance(layer, weight_norm.WeightNormWrapper):
                layer.remove_weight_norm()

        self.waveflow = waveflow

    return iteration
def __init__(self, config_path, checkpoint_path):
    """Create a WaveFlow vocoder from a YAML config file and a checkpoint.

    Args:
        config_path: Path to the YAML configuration for the model.
        checkpoint_path: Path to the parameter checkpoint to restore.
    """
    with open(config_path, 'rt') as config_file:
        raw_config = ruamel.yaml.safe_load(config_file)

    # WaveFlowModule expects an attribute-style config object, so copy the
    # parsed mapping onto a namespace.
    model_config = argparse.Namespace()
    for key, value in raw_config.items():
        setattr(model_config, key, value)
    model_config.use_fp16 = False

    self.model = WaveFlowModule(model_config)
    io.load_parameters(self.model, checkpoint_path=checkpoint_path)
def _initialize(self):
    """Load the TransformerTTS acoustic model and the WaveFlow vocoder from
    the module's bundled assets, on CPU, ready for synthesis."""
    assets_dir = os.path.join(self.directory, "assets")
    self.tts_checkpoint_path = os.path.join(assets_dir, "tts", "step-120000")
    self.waveflow_checkpoint_path = os.path.join(assets_dir, "vocoder",
                                                 "step-2000000")
    self.waveflow_config_path = os.path.join(assets_dir, "vocoder",
                                             "waveflow_ljspeech.yaml")

    # Parse the acoustic-model configuration shipped with the module.
    # NOTE: yaml.Loader can construct arbitrary Python objects; acceptable
    # here because the file is a bundled asset, not untrusted input.
    tts_config_path = os.path.join(assets_dir, "tts", "ljspeech.yaml")
    with open(tts_config_path) as config_file:
        self.tts_config = yaml.load(config_file, Loader=yaml.Loader)

    # The max length of audio when synthesis.
    self.max_len = 1000
    # The threshold of stop token which indicates the time step should stop
    # generating spectrum or not.
    self.stop_threshold = 0.5

    with fluid.dygraph.guard(fluid.CPUPlace()):
        # Build TTS (acoustic model: text -> mel spectrogram).
        with fluid.unique_name.guard():
            net_cfg = self.tts_config['network']
            self.tts_model = TransformerTTSModel(
                net_cfg['embedding_size'], net_cfg['hidden_size'],
                net_cfg['encoder_num_head'], net_cfg['encoder_n_layers'],
                self.tts_config['audio']['num_mels'],
                net_cfg['outputs_per_step'], net_cfg['decoder_num_head'],
                net_cfg['decoder_n_layers'])
            io.load_parameters(
                model=self.tts_model,
                checkpoint_path=self.tts_checkpoint_path)

        # Build vocoder (mel spectrogram -> waveform).
        vocoder_args = AttrDict()
        vocoder_args.config = self.waveflow_config_path
        vocoder_args.use_fp16 = False
        self.waveflow_config = io.add_yaml_config_to_args(vocoder_args)
        self.waveflow = WaveFlowModule(self.waveflow_config)
        io.load_parameters(
            model=self.waveflow,
            checkpoint_path=self.waveflow_checkpoint_path)
def synthesis_with_waveflow(mel_output, args, checkpoint, place):
    """Synthesize a waveform from a mel spectrogram with a WaveFlow vocoder.

    Args:
        mel_output: Mel spectrogram variable; its last two axes are swapped
            below before being fed to the vocoder.
        args: Argument namespace carrying ``config_vocoder``; mutated in
            place (``config`` and ``use_fp16`` are overwritten).
        checkpoint: Path to the vocoder parameter checkpoint.
        place: Paddle device place used for dygraph execution.

    Returns:
        numpy.ndarray: The first synthesized waveform in the batch.
    """
    fluid.enable_dygraph(place)

    # Reuse the args namespace to assemble the vocoder configuration.
    args.config = args.config_vocoder
    args.use_fp16 = False
    vocoder_config = io.add_yaml_config_to_args(args)

    # Swap the frame and channel axes into the layout WaveFlow.synthesize
    # consumes (presumably (batch, mel_bands, frames) — TODO confirm).
    mel_input = fluid.layers.transpose(mel_output, [0, 2, 1])

    # Build the model and restore its weights.
    vocoder = WaveFlowModule(vocoder_config)
    io.load_parameters(model=vocoder, checkpoint_path=checkpoint)

    # Fold weight normalization into plain weights before inference.
    for sublayer in vocoder.sublayers():
        if isinstance(sublayer, weight_norm.WeightNormWrapper):
            sublayer.remove_weight_norm()

    # Run model inference.
    wav = vocoder.synthesize(mel_input, sigma=vocoder_config.sigma)
    return wav.numpy()[0]