def test_fastspeech2_train_some_layers(var_train_expr, config_path):
    """Check how ``var_train_expr`` affects the trainer's trainable variables.

    Builds a 5-speaker FastSpeech2 model, compiles it into a
    ``FastSpeech2Trainer`` configured from the YAML file at ``config_path``
    and compares the number of variables held by the trainer with the number
    of trainable variables of the model.

    Args:
        var_train_expr: Value stored under the ``"var_train_expr"`` config key.
            When it is ``None`` the trainer must keep every trainable variable;
            otherwise it must keep strictly fewer.
        config_path: Path to a YAML trainer configuration file.
    """
    model_config = FastSpeech2Config(n_speakers=5)
    model = TFFastSpeech2(model_config)
    model._build()
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by recent Keras releases.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # use it with trusted configuration files.
    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update({"outdir": "./", "var_train_expr": var_train_expr})

    STRATEGY = return_strategy()

    trainer = FastSpeech2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=False,
    )
    trainer.compile(model, optimizer)

    len_trainable_vars = len(trainer._trainable_variables)
    all_trainable_vars = len(model.trainable_variables)

    if var_train_expr is None:
        tf.debugging.assert_equal(len_trainable_vars, all_trainable_vars)
    else:
        tf.debugging.assert_less(len_trainable_vars, all_trainable_vars)
def test_fastspeech_trainable(num_hidden_layers, n_speakers):
    """Run two optimisation steps of FastSpeech2 on fake data and time the second.

    The model is built with ``num_hidden_layers`` encoder layers and
    ``num_hidden_layers + 1`` decoder layers. The test passes as long as the
    forward pass, the gradient computation and the optimizer update run
    without raising.

    Args:
        num_hidden_layers: Number of encoder hidden layers.
        n_speakers: Number of speakers passed to the model configuration.
    """
    import time

    config = FastSpeech2Config(
        encoder_num_hidden_layers=num_hidden_layers,
        decoder_num_hidden_layers=num_hidden_layers + 1,
        n_speakers=n_speakers,
    )

    fastspeech2 = TFFastSpeech2(config, name="fastspeech")
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by recent Keras releases.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    # fake inputs: a single utterance of 10 tokens, one frame per token
    input_ids = tf.convert_to_tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32)
    speaker_ids = tf.convert_to_tensor([0], tf.int32)
    duration_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
    f0_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.float32)
    energy_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.float32)
    mel_gts = tf.random.uniform(shape=[1, 10, 80], dtype=tf.float32)

    @tf.function
    def one_step_training():
        """Compute duration + mel MSE loss and apply one gradient update."""
        with tf.GradientTape() as tape:
            mel_outputs_before, _, duration_outputs, _, _ = fastspeech2(
                input_ids,
                speaker_ids,
                duration_gts,
                f0_gts,
                energy_gts,
                training=True,
            )
            duration_loss = tf.keras.losses.MeanSquaredError()(
                duration_gts, duration_outputs
            )
            mel_loss = tf.keras.losses.MeanSquaredError()(mel_gts, mel_outputs_before)
            loss = duration_loss + mel_loss
        gradients = tape.gradient(loss, fastspeech2.trainable_variables)
        optimizer.apply_gradients(zip(gradients, fastspeech2.trainable_variables))

        tf.print(loss)

    # The first call is a warm-up and is not timed; only the second call is
    # measured.
    one_step_training()
    start = time.time()
    one_step_training()
    print(time.time() - start)
def main():
    """Run training process.

    Parses the command line, merges it into the YAML configuration, writes the
    merged configuration to ``<outdir>/config.yml``, builds the train/valid
    datasets, the FastSpeech2 model and its optimizer, and runs the trainer.
    A checkpoint is saved if training is interrupted with Ctrl-C.

    Raises:
        ValueError: If ``--train-dir`` or ``--dev-dir`` is missing, or if the
            configured data ``format`` is not ``"npy"``.
    """
    parser = argparse.ArgumentParser(
        description="Train FastSpeech (See detail in tensorflow_tts/bin/train-fastspeech.py)"
    )
    parser.add_argument(
        "--train-dir",
        default=None,
        type=str,
        help="directory including training data. ",
    )
    parser.add_argument(
        "--dev-dir",
        default=None,
        type=str,
        help="directory including development data. ",
    )
    parser.add_argument(
        "--use-norm", default=1, type=int, help="usr norm-mels for train or raw."
    )
    # NOTE(review): `required=True` makes the defaults of the two stat
    # arguments below unreachable; kept as-is to preserve the CLI contract.
    parser.add_argument(
        "--f0-stat",
        default="./dump/stats_f0.npy",
        type=str,
        required=True,
        help="f0-stat path.",
    )
    parser.add_argument(
        "--energy-stat",
        default="./dump/stats_energy.npy",
        type=str,
        required=True,
        help="energy-stat path.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save checkpoints."
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml format configuration file."
    )
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument(
        "--mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    parser.add_argument(
        "--pretrained",
        default="",
        type=str,
        nargs="?",
        help="pretrained weights .h5 file to load weights from. "
        "Auto-skips non-matching layers",
    )
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # set mixed precision config
    if args.mixed_precision == 1:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    args.mixed_precision = bool(args.mixed_precision)
    args.use_norm = bool(args.use_norm)

    # set logger: verbose > 1 -> DEBUG, verbose == 1 -> INFO, otherwise WARN
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if args.verbose <= 0:
        logging.warning("Skip DEBUG/INFO messages")

    # make sure the output directory exists (no exists()/makedirs() race)
    os.makedirs(args.outdir, exist_ok=True)

    # check arguments
    if args.train_dir is None:
        raise ValueError("Please specify --train-dir")
    if args.dev_dir is None:
        # The flag is declared as --dev-dir above; name it correctly.
        raise ValueError("Please specify --dev-dir")

    # load and save config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # use it with trusted configuration files.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    mel_length_threshold = (
        config["mel_length_threshold"] if config["remove_short_samples"] else None
    )

    if config["format"] != "npy":
        raise ValueError("Only npy are supported.")
    charactor_query = "*-ids.npy"
    mel_query = "*-norm-feats.npy" if args.use_norm else "*-raw-feats.npy"
    duration_query = "*-durations.npy"
    f0_query = "*-raw-f0.npy"
    energy_query = "*-raw-energy.npy"

    # keyword arguments shared by the train and valid datasets
    dataset_kwargs = dict(
        charactor_query=charactor_query,
        mel_query=mel_query,
        duration_query=duration_query,
        f0_query=f0_query,
        energy_query=energy_query,
        f0_stat=args.f0_stat,
        energy_stat=args.energy_stat,
        mel_length_threshold=mel_length_threshold,
    )

    # define train/valid dataset
    # The train batch is additionally scaled by the number of gradient
    # accumulation steps; the valid batch is not.
    train_dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.train_dir, **dataset_kwargs
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"]
        * STRATEGY.num_replicas_in_sync
        * config["gradient_accumulation_steps"],
    )

    valid_dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.dev_dir, **dataset_kwargs
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
    )

    # define trainer
    trainer = FastSpeech2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=args.mixed_precision,
    )

    with STRATEGY.scope():
        # define model
        fastspeech = TFFastSpeech2(
            config=FastSpeech2Config(**config["fastspeech2_params"])
        )
        fastspeech._build()
        fastspeech.summary()
        # `--pretrained` given without a value parses to None (nargs="?"),
        # so test truthiness instead of calling len() on it.
        if args.pretrained:
            fastspeech.load_weights(args.pretrained, by_name=True, skip_mismatch=True)
            logging.info(
                f"Successfully loaded pretrained weight from {args.pretrained}."
            )

        # AdamW for fastspeech: polynomial decay wrapped in a warm-up schedule
        optimizer_params = config["optimizer_params"]
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=optimizer_params["initial_learning_rate"],
            decay_steps=optimizer_params["decay_steps"],
            end_learning_rate=optimizer_params["end_learning_rate"],
        )

        learning_rate_fn = WarmUp(
            initial_learning_rate=optimizer_params["initial_learning_rate"],
            decay_schedule_fn=learning_rate_fn,
            warmup_steps=int(
                config["train_max_steps"] * optimizer_params["warmup_proportion"]
            ),
        )

        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=optimizer_params["weight_decay"],
            beta_1=0.9,
            beta_2=0.98,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )

        # NOTE(review): presumably touches `iterations` so the step counter is
        # created inside the strategy scope; confirm before removing.
        _ = optimizer.iterations

    # compile trainer
    trainer.compile(model=fastspeech, optimizer=optimizer)

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config["outdir"], "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        # save progress when the user stops training manually
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")
def main():
    """Run fastspeech2 decoding from folder.

    Loads every ``*-ids.npy`` file under ``--rootdir``, runs FastSpeech2
    inference with speaker id 0 and neutral speed/f0/energy ratios, and writes
    ``<utt_id>-fs-before-feats.npy`` and ``<utt_id>-fs-after-feats.npy`` into
    ``--outdir``. Each mel is truncated to the sum of the durations returned
    by the model for that utterance.

    Raises:
        ValueError: If the configured data ``format`` is not ``"npy"``.
        KeyError: If the configuration has neither a ``fastspeech2_params``
            nor a ``fastspeech_params`` entry.
    """
    parser = argparse.ArgumentParser(
        description="Decode soft-mel features from charactor with trained FastSpeech "
        "(See detail in examples/fastspeech2/decode_fastspeech2.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="directory including ids/durations files.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save generated speech."
    )
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="checkpoint file to be loaded."
    )
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        required=True,
        help="yaml format configuration file. if not explicitly provided, "
        "it will be searched in the checkpoint directory. (default=None)",
    )
    parser.add_argument(
        "--batch-size",
        default=8,
        type=int,
        required=False,
        help="Batch size for inference.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger: verbose > 1 -> DEBUG, verbose == 1 -> INFO, otherwise WARN
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if args.verbose <= 0:
        logging.warning("Skip DEBUG/INFO messages")

    # make sure the output directory exists (no exists()/makedirs() race)
    os.makedirs(args.outdir, exist_ok=True)

    # load config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # use it with trusted configuration files.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    if config["format"] != "npy":
        raise ValueError("Only npy is supported.")
    char_query = "*-ids.npy"
    char_load_fn = np.load

    # define data-loader
    dataset = CharactorDataset(
        root_dir=args.rootdir,
        charactor_query=char_query,
        charactor_load_fn=char_load_fn,
        return_utt_id=True,
    )
    dataset = dataset.create(batch_size=args.batch_size)

    # The training entry point of this file reads the model hyper-parameters
    # from "fastspeech2_params"; prefer that key and fall back to the
    # previously used "fastspeech_params" so older configs keep working.
    if "fastspeech2_params" in config:
        model_params = config["fastspeech2_params"]
    else:
        model_params = config["fastspeech_params"]

    # define model and load checkpoint
    fastspeech2 = TFFastSpeech2(
        config=FastSpeech2Config(**model_params), name="fastspeech2"
    )
    fastspeech2._build()
    fastspeech2.load_weights(args.checkpoint)

    for data in tqdm(dataset, desc="Decoding"):
        utt_ids = data[0]
        char_ids = data[1]
        batch_size = tf.shape(char_ids)[0]

        # fastspeech inference.
        (
            masked_mel_before,
            masked_mel_after,
            duration_outputs,
            _,
            _,
        ) = fastspeech2.inference(
            char_ids,
            # id 0 is treated as padding when building the attention mask
            attention_mask=tf.math.not_equal(char_ids, 0),
            speaker_ids=tf.zeros(shape=[batch_size], dtype=tf.int32),
            speed_ratios=tf.ones(shape=[batch_size], dtype=tf.float32),
            f0_ratios=tf.ones(shape=[batch_size], dtype=tf.float32),
            energy_ratios=tf.ones(shape=[batch_size], dtype=tf.float32),
        )

        # convert to numpy
        masked_mel_befores = masked_mel_before.numpy()
        masked_mel_afters = masked_mel_after.numpy()

        for utt_id, mel_before, mel_after, durations in zip(
            utt_ids, masked_mel_befores, masked_mel_afters, duration_outputs
        ):
            # real len of mel predicted
            real_length = durations.numpy().sum()
            utt_id = utt_id.numpy().decode("utf-8")
            # save to folder.
            np.save(
                os.path.join(args.outdir, f"{utt_id}-fs-before-feats.npy"),
                mel_before[:real_length, :].astype(np.float32),
                allow_pickle=False,
            )
            np.save(
                os.path.join(args.outdir, f"{utt_id}-fs-after-feats.npy"),
                mel_after[:real_length, :].astype(np.float32),
                allow_pickle=False,
            )
def main():
    """Run fastspeech2 over a dataset folder and save the second mel output.

    Feeds every sample of the dataset found under ``--rootdir`` through
    FastSpeech2 and writes the second tensor returned by the model, truncated
    to the sample's ``mel_lengths`` entry, to
    ``<outdir>/postnets/<utt_id>-postnet.npy``.

    The batch size is forced to 1, so ``--batch-size`` is accepted but not
    used.

    Raises:
        ValueError: If the configured data ``format`` is not ``"npy"``.
    """
    parser = argparse.ArgumentParser(
        description="Decode soft-mel features from charactor with trained FastSpeech "
        "(See detail in examples/fastspeech2/decode_fastspeech2.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="directory including ids/durations files.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save generated speech."
    )
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="checkpoint file to be loaded."
    )
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        required=True,
        help="yaml format configuration file. if not explicitly provided, "
        "it will be searched in the checkpoint directory. (default=None)",
    )
    parser.add_argument(
        "--batch-size",
        default=8,
        type=int,
        required=False,
        help="Batch size for inference.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger: verbose > 1 -> DEBUG, verbose == 1 -> INFO, otherwise WARN
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if args.verbose <= 0:
        logging.warning("Skip DEBUG/INFO messages")

    # make sure the output directories exist (no exists()/makedirs() race)
    os.makedirs(args.outdir, exist_ok=True)
    outdpost = os.path.join(args.outdir, "postnets")
    os.makedirs(outdpost, exist_ok=True)

    # load config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # use it with trusted configuration files.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    if config["format"] != "npy":
        raise ValueError("Only npy is supported.")
    char_query = "*-ids.npy"
    char_load_fn = np.load

    # define data-loader
    dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.rootdir,
        charactor_query=char_query,
        charactor_load_fn=char_load_fn,
    )
    dataset = dataset.create(
        batch_size=1
    )  # force batch size to 1 otherwise it may miss certain files

    # define model and load checkpoint
    fastspeech2 = TFFastSpeech2(
        config=FastSpeech2Config(**config["fastspeech2_params"]), name="fastspeech2"
    )
    fastspeech2._build()
    fastspeech2.load_weights(args.checkpoint)
    fastspeech2 = tf.function(fastspeech2, experimental_relax_shapes=True)

    for data in tqdm(dataset, desc="Decoding"):
        utt_ids = data["utt_ids"]
        mel_lens = data["mel_lengths"]

        # NOTE(review): the model is called with training=True although this
        # is a decoding script; presumably this is what makes it consume the
        # ground-truth features contained in `data` - confirm it is intended.
        _, masked_mel_after, _, _, _ = fastspeech2(**data, training=True)

        # convert to numpy
        masked_mel_afters = masked_mel_after.numpy()

        for utt_id, mel_after, mel_len in zip(utt_ids, masked_mel_afters, mel_lens):
            utt_id = utt_id.numpy().decode("utf-8")

            # keep only the frames up to the sample's mel length
            np.save(
                os.path.join(outdpost, f"{utt_id}-postnet.npy"),
                mel_after[:mel_len, :].astype(np.float32),
                allow_pickle=False,
            )