def test_fastspeech2_train_some_layers(var_train_expr, config_path):
    config = FastSpeech2Config(n_speakers=5)
    model = TFFastSpeech2(config)
    model._build()
    optimizer = tf.keras.optimizers.Adam(lr=0.001)

    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    config.update({"outdir": "./"})
    config.update({"var_train_expr": var_train_expr})

    STRATEGY = return_strategy()

    trainer = FastSpeech2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=False,
    )
    trainer.compile(model, optimizer)

    len_trainable_vars = len(trainer._trainable_variables)
    all_trainable_vars = len(model.trainable_variables)

    if var_train_expr is None:
        tf.debugging.assert_equal(len_trainable_vars, all_trainable_vars)
    else:
        tf.debugging.assert_less(len_trainable_vars, all_trainable_vars)
def test_fastspeech_trainable(num_hidden_layers, n_speakers):
    config = FastSpeech2Config(
        encoder_num_hidden_layers=num_hidden_layers,
        decoder_num_hidden_layers=num_hidden_layers + 1,
        n_speakers=n_speakers,
    )

    fastspeech2 = TFFastSpeech2(config, name="fastspeech")
    optimizer = tf.keras.optimizers.Adam(lr=0.001)

    # fake inputs
    input_ids = tf.convert_to_tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32)
    attention_mask = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
    speaker_ids = tf.convert_to_tensor([0], tf.int32)
    duration_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
    f0_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.float32)
    energy_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.float32)
    mel_gts = tf.random.uniform(shape=[1, 10, 80], dtype=tf.float32)

    @tf.function
    def one_step_training():
        with tf.GradientTape() as tape:
            mel_outputs_before, _, duration_outputs, _, _ = fastspeech2(
                input_ids,
                speaker_ids,
                duration_gts,
                f0_gts,
                energy_gts,
                training=True,
            )
            duration_loss = tf.keras.losses.MeanSquaredError()(
                duration_gts, duration_outputs
            )
            mel_loss = tf.keras.losses.MeanSquaredError()(mel_gts, mel_outputs_before)
            loss = duration_loss + mel_loss
        gradients = tape.gradient(loss, fastspeech2.trainable_variables)
        optimizer.apply_gradients(zip(gradients, fastspeech2.trainable_variables))

        tf.print(loss)

    import time

    # The first call traces the tf.function; time only the second step.
    for i in range(2):
        if i == 1:
            start = time.time()
        one_step_training()
    print(time.time() - start)
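# The two tests above receive their arguments from pytest parametrization,
# which is not shown in this snippet. Below is a minimal sketch of how they
# are typically driven; the parameter values and the config path are
# illustrative assumptions, not the project's actual test setup.
import pytest


@pytest.mark.parametrize(
    "var_train_expr, config_path",
    [
        (None, "./examples/fastspeech2/conf/fastspeech2.v1.yaml"),
        ("embeddings|encoder", "./examples/fastspeech2/conf/fastspeech2.v1.yaml"),
    ],
)
def test_fastspeech2_train_some_layers(var_train_expr, config_path):
    ...  # body as defined above


@pytest.mark.parametrize("num_hidden_layers, n_speakers", [(2, 1), (3, 2)])
def test_fastspeech_trainable(num_hidden_layers, n_speakers):
    ...  # body as defined above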
def _load_fastspeech2(self, path='./model_files/fastspeech2'):
    config = os.path.join(path, 'config.yml')
    with open(config) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    config = FastSpeech2Config(**config["fastspeech_params"])
    fastspeech2 = TFFastSpeech2(config=config,
                                name="fastspeech2v1",
                                enable_tflite_convertible=True)
    fastspeech2._build()

    weights = os.path.join(path, 'model-150000.h5')
    fastspeech2.load_weights(weights)
    fastspeech2.summary()  # summary() prints directly; no need to wrap it in print()
    return fastspeech2
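# A hedged usage sketch for the loader above. The `synthesizer` object that
# owns _load_fastspeech2() and the input id sequence are assumptions; the
# inference call simply mirrors the one used in the decode script further
# below, so check it against the model version you actually load.
import tensorflow as tf

fastspeech2 = synthesizer._load_fastspeech2()
input_ids = tf.convert_to_tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32)
mel_before, mel_after, durations, _, _ = fastspeech2.inference(
    input_ids,
    attention_mask=tf.math.not_equal(input_ids, 0),
    speaker_ids=tf.zeros(shape=[tf.shape(input_ids)[0]], dtype=tf.int32),
    speed_ratios=tf.ones(shape=[tf.shape(input_ids)[0]], dtype=tf.float32),
    f0_ratios=tf.ones(shape=[tf.shape(input_ids)[0]], dtype=tf.float32),
    energy_ratios=tf.ones(shape=[tf.shape(input_ids)[0]], dtype=tf.float32),
)
print(mel_after.shape)  # expected (1, total_frames, num_mels)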
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description="Train FastSpeech2 (See detail in examples/fastspeech2/train_fastspeech2.py)"
    )
    parser.add_argument(
        "--train-dir",
        default=None,
        type=str,
        help="directory including training data.",
    )
    parser.add_argument(
        "--dev-dir",
        default=None,
        type=str,
        help="directory including development data.",
    )
    parser.add_argument(
        "--use-norm", default=1, type=int, help="use norm-mels for training or raw mels."
    )
    parser.add_argument(
        "--f0-stat",
        default="./dump/stats_f0.npy",
        type=str,
        required=True,
        help="f0-stat path.",
    )
    parser.add_argument(
        "--energy-stat",
        default="./dump/stats_energy.npy",
        type=str,
        required=True,
        help="energy-stat path.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save checkpoints."
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml format configuration file."
    )
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument(
        "--mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    parser.add_argument(
        "--pretrained",
        default="",
        type=str,
        nargs="?",
        help="pretrained weights .h5 file to load weights from. Auto-skips non-matching layers",
    )
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # set mixed precision config
    if args.mixed_precision == 1:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    args.mixed_precision = bool(args.mixed_precision)
    args.use_norm = bool(args.use_norm)

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # check arguments
    if args.train_dir is None:
        raise ValueError("Please specify --train-dir")
    if args.dev_dir is None:
        raise ValueError("Please specify --dev-dir")

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["mel_length_threshold"]
    else:
        mel_length_threshold = None

    if config["format"] == "npy":
        charactor_query = "*-ids.npy"
        mel_query = "*-raw-feats.npy" if args.use_norm is False else "*-norm-feats.npy"
        duration_query = "*-durations.npy"
        f0_query = "*-raw-f0.npy"
        energy_query = "*-raw-energy.npy"
    else:
        raise ValueError("Only npy are supported.")

    # define train/valid dataset
    train_dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.train_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        duration_query=duration_query,
        f0_query=f0_query,
        energy_query=energy_query,
        f0_stat=args.f0_stat,
        energy_stat=args.energy_stat,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"]
        * STRATEGY.num_replicas_in_sync
        * config["gradient_accumulation_steps"],
    )

    valid_dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.dev_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        duration_query=duration_query,
        f0_query=f0_query,
        energy_query=energy_query,
        f0_stat=args.f0_stat,
        energy_stat=args.energy_stat,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
    )

    # define trainer
    trainer = FastSpeech2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=args.mixed_precision,
    )

    with STRATEGY.scope():
        # define model
        fastspeech = TFFastSpeech2(
            config=FastSpeech2Config(**config["fastspeech2_params"])
        )
        fastspeech._build()
        fastspeech.summary()

        if len(args.pretrained) > 1:
            fastspeech.load_weights(args.pretrained, by_name=True, skip_mismatch=True)
            logging.info(
                f"Successfully loaded pretrained weight from {args.pretrained}."
            )

        # AdamW for fastspeech
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=config["optimizer_params"]["initial_learning_rate"],
            decay_steps=config["optimizer_params"]["decay_steps"],
            end_learning_rate=config["optimizer_params"]["end_learning_rate"],
        )

        learning_rate_fn = WarmUp(
            initial_learning_rate=config["optimizer_params"]["initial_learning_rate"],
            decay_schedule_fn=learning_rate_fn,
            warmup_steps=int(
                config["train_max_steps"]
                * config["optimizer_params"]["warmup_proportion"]
            ),
        )

        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=config["optimizer_params"]["weight_decay"],
            beta_1=0.9,
            beta_2=0.98,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )

        _ = optimizer.iterations

    # compile trainer
    trainer.compile(model=fastspeech, optimizer=optimizer)

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config["outdir"], "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps} steps.")
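# return_strategy() is called by the test and training code above but is not
# defined anywhere in this section. A minimal sketch, assuming the usual
# pattern of picking a tf.distribute strategy from the visible GPUs (not
# necessarily the project's actual implementation):
import tensorflow as tf


def return_strategy():
    physical_devices = tf.config.list_physical_devices("GPU")
    if len(physical_devices) == 0:
        # no GPU: run everything on the CPU
        return tf.distribute.OneDeviceStrategy(device="/cpu:0")
    if len(physical_devices) == 1:
        # single GPU: no replication needed
        return tf.distribute.OneDeviceStrategy(device="/gpu:0")
    # multiple GPUs: synchronous data-parallel training
    return tf.distribute.MirroredStrategy()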
def main():
    """Run fastspeech2 decoding from folder."""
    parser = argparse.ArgumentParser(
        description="Decode soft-mel features from charactor with trained FastSpeech2 "
        "(See detail in examples/fastspeech2/decode_fastspeech2.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="directory including ids/durations files.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save generated speech."
    )
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="checkpoint file to be loaded."
    )
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        required=True,
        help="yaml format configuration file. if not explicitly provided, "
        "it will be searched in the checkpoint directory. (default=None)",
    )
    parser.add_argument(
        "--batch-size",
        default=8,
        type=int,
        required=False,
        help="Batch size for inference.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    if config["format"] == "npy":
        char_query = "*-ids.npy"
        char_load_fn = np.load
    else:
        raise ValueError("Only npy is supported.")

    # define data-loader
    dataset = CharactorDataset(
        root_dir=args.rootdir,
        charactor_query=char_query,
        charactor_load_fn=char_load_fn,
        return_utt_id=True,
    )
    dataset = dataset.create(batch_size=args.batch_size)

    # define model and load checkpoint
    fastspeech2 = TFFastSpeech2(
        config=FastSpeech2Config(**config["fastspeech_params"]), name="fastspeech2"
    )
    fastspeech2._build()
    fastspeech2.load_weights(args.checkpoint)

    for data in tqdm(dataset, desc="Decoding"):
        utt_ids = data[0]
        char_ids = data[1]

        # fastspeech inference.
        (
            masked_mel_before,
            masked_mel_after,
            duration_outputs,
            _,
            _,
        ) = fastspeech2.inference(
            char_ids,
            attention_mask=tf.math.not_equal(char_ids, 0),
            speaker_ids=tf.zeros(shape=[tf.shape(char_ids)[0]], dtype=tf.int32),
            speed_ratios=tf.ones(shape=[tf.shape(char_ids)[0]], dtype=tf.float32),
            f0_ratios=tf.ones(shape=[tf.shape(char_ids)[0]], dtype=tf.float32),
            energy_ratios=tf.ones(shape=[tf.shape(char_ids)[0]], dtype=tf.float32),
        )

        # convert to numpy
        masked_mel_befores = masked_mel_before.numpy()
        masked_mel_afters = masked_mel_after.numpy()

        for (utt_id, mel_before, mel_after, durations) in zip(
            utt_ids, masked_mel_befores, masked_mel_afters, duration_outputs
        ):
            # real len of mel predicted
            real_length = durations.numpy().sum()
            utt_id = utt_id.numpy().decode("utf-8")

            # save to folder.
            np.save(
                os.path.join(args.outdir, f"{utt_id}-fs-before-feats.npy"),
                mel_before[:real_length, :].astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, f"{utt_id}-fs-after-feats.npy"),
                mel_after[:real_length, :].astype(np.float32),
                allow_pickle=False,
            )
def main():
    """Run fastspeech2 decoding from folder."""
    parser = argparse.ArgumentParser(
        description="Decode soft-mel features from charactor with trained FastSpeech2 "
        "(See detail in examples/fastspeech2/decode_fastspeech2.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="directory including ids/durations files.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save generated speech."
    )
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="checkpoint file to be loaded."
    )
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        required=True,
        help="yaml format configuration file. if not explicitly provided, "
        "it will be searched in the checkpoint directory. (default=None)",
    )
    parser.add_argument(
        "--batch-size",
        default=8,
        type=int,
        required=False,
        help="Batch size for inference.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    outdpost = os.path.join(args.outdir, "postnets")
    if not os.path.exists(outdpost):
        os.makedirs(outdpost)

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    if config["format"] == "npy":
        char_query = "*-ids.npy"
        char_load_fn = np.load
    else:
        raise ValueError("Only npy is supported.")

    # define data-loader
    dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.rootdir,
        charactor_query=char_query,
        charactor_load_fn=char_load_fn,
    )
    # force batch size to 1, otherwise it may miss certain files
    dataset = dataset.create(batch_size=1)

    # define model and load checkpoint
    fastspeech2 = TFFastSpeech2(
        config=FastSpeech2Config(**config["fastspeech2_params"]), name="fastspeech2"
    )
    fastspeech2._build()
    fastspeech2.load_weights(args.checkpoint)
    fastspeech2 = tf.function(fastspeech2, experimental_relax_shapes=True)

    for data in tqdm(dataset, desc="Decoding"):
        utt_ids = data["utt_ids"]
        char_ids = data["input_ids"]
        mel_lens = data["mel_lengths"]

        # fastspeech inference (teacher-forced with ground-truth durations/f0/energy).
        masked_mel_before, masked_mel_after, duration_outputs, _, _ = fastspeech2(
            **data, training=True
        )

        # convert to numpy
        masked_mel_befores = masked_mel_before.numpy()
        masked_mel_afters = masked_mel_after.numpy()

        for (utt_id, mel_before, mel_after, durations, mel_len) in zip(
            utt_ids, masked_mel_befores, masked_mel_afters, duration_outputs, mel_lens
        ):
            # real len of mel predicted
            real_length = np.around(durations.numpy().sum()).astype(int)
            utt_id = utt_id.numpy().decode("utf-8")

            np.save(
                os.path.join(outdpost, f"{utt_id}-postnet.npy"),
                mel_after[:mel_len, :].astype(np.float32),
                allow_pickle=False,
            )
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(description="Train FastSpeech2")
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save checkpoints."
    )
    parser.add_argument(
        "--rootdir", type=str, required=True, help="dataset directory root"
    )
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument("--batch-size", default=16, type=int, help="batch size.")
    parser.add_argument(
        "--mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    parser.add_argument(
        "--pretrained",
        default="",
        type=str,
        nargs="?",
        help="pretrained weights .h5 file to load weights from. Auto-skips non-matching layers",
    )
    args = parser.parse_args()

    if args.resume is not None and os.path.isdir(args.resume):
        args.resume = tf.train.latest_checkpoint(args.resume)

    # return strategy
    STRATEGY = return_strategy()

    # set mixed precision config
    if args.mixed_precision == 1:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    args.mixed_precision = bool(args.mixed_precision)

    # set logger
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format=log_format)
    elif args.verbose > 0:
        logging.basicConfig(level=logging.INFO, stream=sys.stdout, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, stream=sys.stdout, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence (checkpoint)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # select processor
    Processor = JSpeechProcessor  # for test

    class Generator(Processor.Generator):
        def __init__(self):
            super().__init__()
            self._scaler_energy = StandardScaler(copy=False)
            self._scaler_f0 = StandardScaler(copy=False)
            self._energy_stat = np.stack((0, 0))
            self._f0_stat = np.stack((0, 0))

        def __call__(self, rootdir, tid, seq, speaker):
            tid, seq, feat_path, speaker = super().__call__(rootdir, tid, seq, speaker)

            f0_path = os.path.join(rootdir, "f0", f"{tid}.f0")
            energy_path = os.path.join(rootdir, "energies", f"{tid}.e")
            duration_path = os.path.join(rootdir, "durations", f"{tid}.dur")

            # f0/energy files are raw float32 dumps, so read them in binary mode
            with open(f0_path, "rb") as f:
                f0 = np.fromfile(f, dtype="float32")
                self._scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

            with open(energy_path, "rb") as f:
                energy = np.fromfile(f, dtype="float32")
                self._scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))

            return tid, seq, feat_path, f0_path, energy_path, duration_path, speaker

        def complete(self):
            self._f0_stat = np.stack((self._scaler_f0.mean_, self._scaler_f0.scale_))
            self._energy_stat = np.stack(
                (self._scaler_energy.mean_, self._scaler_energy.scale_)
            )
            print(
                "energy stat: mean {}, scale {}".format(
                    self._energy_stat[0], self._energy_stat[1]
                )
            )
            print("f0 stat: mean {}, scale {}".format(self._f0_stat[0], self._f0_stat[1]))

        def energy_stat(self):
            return self._energy_stat

        def f0_stat(self):
            return self._f0_stat

    generator = Generator()
    processor = Processor(rootdir=args.rootdir, generator=generator)
    config = Config(args.outdir, args.batch_size, processor.vocab_size())

    # split train and test
    train_split, valid_split = train_test_split(
        processor.items, test_size=config.test_size, random_state=42, shuffle=True
    )

    train_dataset = generate_datasets(
        train_split, config, generator.f0_stat(), generator.energy_stat()
    )
    valid_dataset = generate_datasets(
        valid_split, config, generator.f0_stat(), generator.energy_stat()
    )

    # define trainer
    trainer = FastSpeech2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=args.mixed_precision,
    )

    with STRATEGY.scope():
        # define model
        fastspeech = TFFastSpeech2(config=config)

        # build
        fastspeech._build()
        fastspeech.summary()

        if len(args.pretrained) > 1:
            fastspeech.load_weights(args.pretrained, by_name=True, skip_mismatch=True)
            logging.info(
                f"Successfully loaded pretrained weight from {args.pretrained}."
            )

        # AdamW for fastspeech
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=config.initial_learning_rate,
            decay_steps=config.decay_steps,
            end_learning_rate=config.end_learning_rate,
        )

        learning_rate_fn = WarmUp(
            initial_learning_rate=config.initial_learning_rate,
            decay_schedule_fn=learning_rate_fn,
            warmup_steps=int(config.train_max_steps * config.warmup_proportion),
        )

        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=config.weight_decay,
            beta_1=0.9,
            beta_2=0.98,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )

        _ = optimizer.iterations

    # compile trainer
    trainer.compile(model=fastspeech, optimizer=optimizer)

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config.outdir, "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps} steps.")
def main():
    parser = argparse.ArgumentParser(description="Dump FastSpeech2")
    parser.add_argument(
        "--outdir",
        default="./",
        type=str,
        help="directory to save pb or tflite file.",
    )
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="checkpoint file to be loaded."
    )
    parser.add_argument("--vocab_size", type=int, required=True, help="vocab size")
    # NOTE: argparse's type=bool treats any non-empty string as True, so an
    # integer flag (0/1) is used here and converted below.
    parser.add_argument(
        "--tflite", type=int, default=0, help="save model to tflite (0 or 1)."
    )
    args = parser.parse_args()
    args.tflite = bool(args.tflite)

    # check directory existence (checkpoint)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    if args.checkpoint is not None and os.path.isdir(args.checkpoint):
        args.checkpoint = tf.train.latest_checkpoint(args.checkpoint)

    save_name = os.path.splitext(os.path.basename(args.checkpoint))[0]

    config = Config(args.outdir, args.vocab_size)

    # define model.
    fastspeech2 = TFFastSpeech2(
        config=config, name="fastspeech2", enable_tflite_convertible=args.tflite
    )

    # build
    if args.tflite is True:
        print("dump tflite => vocab_size: {}".format(args.vocab_size))

        input_ids = tf.convert_to_tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32)
        speaker_ids = tf.convert_to_tensor([0], tf.int32)
        duration_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
        f0_gts = tf.convert_to_tensor(
            [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]], tf.float32
        )
        energy_gts = tf.convert_to_tensor(
            [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]], tf.float32
        )

        fastspeech2(input_ids, speaker_ids, duration_gts, f0_gts, energy_gts)
        fastspeech2.load_weights(args.checkpoint)
        fastspeech2.summary()

        fastspeech2_concrete_function = (
            fastspeech2.inference_tflite.get_concrete_function()
        )
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [fastspeech2_concrete_function]
        )
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        tflite_model = converter.convert()

        with open(os.path.join(args.outdir, "{}.tflite".format(save_name)), "wb") as f:
            f.write(tflite_model)
    else:
        print("dump => vocab_size: {}".format(args.vocab_size))

        # tensorflow-gpu==2.3.0 bug: inference must be called once before load_weights
        fastspeech2.inference(
            input_ids=tf.expand_dims(tf.convert_to_tensor([1], dtype=tf.int32), 0),
            speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
        fastspeech2.load_weights(args.checkpoint)

        tf.saved_model.save(
            fastspeech2,
            os.path.join(args.outdir, save_name),
            signatures=fastspeech2.inference,
        )
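# A hedged sketch of loading the .tflite file written above and running one
# inference with the TFLite interpreter. The model path, the ordering of the
# inputs, and the index of the post-net mel output are assumptions; check
# get_input_details() / get_output_details() for the real names and shapes.
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path="./fastspeech2.tflite")
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

input_ids = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], dtype=np.int32)

# The id input has a dynamic sequence length; resize it before allocating tensors.
interpreter.resize_tensor_input(input_details[0]["index"], input_ids.shape)
interpreter.allocate_tensors()

# ASSUMPTION: inputs are ordered as ids, speaker_ids, speed/f0/energy ratios.
interpreter.set_tensor(input_details[0]["index"], input_ids)
interpreter.set_tensor(input_details[1]["index"], np.array([0], dtype=np.int32))
interpreter.set_tensor(input_details[2]["index"], np.array([1.0], dtype=np.float32))
interpreter.set_tensor(input_details[3]["index"], np.array([1.0], dtype=np.float32))
interpreter.set_tensor(input_details[4]["index"], np.array([1.0], dtype=np.float32))

interpreter.invoke()
mel_after = interpreter.get_tensor(output_details[1]["index"])  # assumed: post-net mel
print(mel_after.shape)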