def test_fastspeech_resize_positional_embeddings(new_size):
    """Check that a checkpoint can be reloaded after resizing positional embeddings.

    A default-configured model is built and its weights are written to
    ``./test.h5``. The positional embeddings are then resized to ``new_size``
    and the same checkpoint is loaded back with ``by_name=True`` and
    ``skip_mismatch=True``.

    Args:
        new_size: value forwarded to ``resize_positional_embeddings``.
    """
    checkpoint_path = "./test.h5"

    model = TFFastSpeech(FastSpeechConfig(), name="fastspeech")
    model._build()
    model.save_weights(checkpoint_path)

    model.resize_positional_embeddings(new_size)

    # The resized embedding no longer matches the saved tensor, hence the
    # by-name load that is asked to skip mismatching weights.
    model.load_weights(checkpoint_path, by_name=True, skip_mismatch=True)
def test_fastspeech_trainable(num_hidden_layers, n_speakers):
    """Run two optimisation steps on a tiny fake batch and time the second one.

    The test passes when a forward pass, loss computation, gradient
    computation and optimizer update all run without raising.

    Args:
        num_hidden_layers: forwarded to ``FastSpeechConfig(num_hidden_layers=...)``.
        n_speakers: forwarded to ``FastSpeechConfig(n_speakers=...)``.
    """
    import time

    config = FastSpeechConfig(
        num_hidden_layers=num_hidden_layers, n_speakers=n_speakers
    )
    fastspeech = TFFastSpeech(config, name="fastspeech")
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    # fake inputs: a single utterance of 10 tokens, each with duration 1,
    # so the mel target has 10 frames (of 80 bins).
    input_ids = tf.convert_to_tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32)
    attention_mask = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
    speaker_ids = tf.convert_to_tensor([0], tf.int32)
    duration_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
    mel_gts = tf.random.uniform(shape=[1, 10, 80], dtype=tf.float32)

    # One loss object serves both terms.
    mse = tf.keras.losses.MeanSquaredError()

    @tf.function
    def one_step_training():
        with tf.GradientTape() as tape:
            mel_outputs_before, _, duration_outputs = fastspeech(
                input_ids, attention_mask, speaker_ids, duration_gts, training=True
            )
            duration_loss = mse(duration_gts, duration_outputs)
            mel_loss = mse(mel_gts, mel_outputs_before)
            loss = duration_loss + mel_loss
        gradients = tape.gradient(loss, fastspeech.trainable_variables)
        optimizer.apply_gradients(zip(gradients, fastspeech.trainable_variables))

        tf.print(loss)

    # The first call pays for tf.function tracing, so it is run as a warm-up
    # and only the second call is timed.
    one_step_training()
    start = time.perf_counter()
    one_step_training()
    print(time.perf_counter() - start)
def get_model():
    """Build a TFFastSpeech model and load its weights from 'fastspeech-150k.h5'.

    The model hyper-parameters are read from the ``fastspeech_params`` section
    of 'fastspeech_config.yml'. Both files are located via ``get_weight_path``.

    Returns:
        The built ``TFFastSpeech`` instance with weights loaded.
    """
    config_path = get_weight_path('fastspeech_config.yml')
    # NOTE: yaml.Loader can construct arbitrary Python objects; this is only
    # appropriate because the config file is expected to be a trusted asset.
    with open(config_path) as config_file:
        raw_config = yaml.load(config_file, Loader=yaml.Loader)

    model_config = FastSpeechConfig(**raw_config['fastspeech_params'])
    model = TFFastSpeech(config=model_config, name='fastspeech')
    model._build()

    weights_path = get_weight_path('fastspeech-150k.h5')
    model.load_weights(weights_path)
    return model
def main():
    """Run training process.

    Steps, in order:

    1. Parse command-line arguments and validate the required directories.
    2. Optionally enable TensorFlow's ``auto_mixed_precision`` graph option.
    3. Configure logging and create the output directory.
    4. Load the YAML config, overlay the CLI arguments and the package
       version on top of it, and save the result to ``<outdir>/config.yml``.
    5. Build the training / validation datasets, the FastSpeech model, the
       trainer and an AdamWeightDecay optimizer driven by a polynomial-decay
       schedule with warm-up.
    6. Train; on ``KeyboardInterrupt`` a checkpoint is saved before returning.

    Raises:
        ValueError: if ``--train-dir`` or ``--dev-dir`` is not given, or if
            the configured ``format`` is anything other than ``"npy"``.
    """
    parser = argparse.ArgumentParser(
        description="Train FastSpeech (See detail in tensorflow_tts/bin/train-fastspeech.py)"
    )
    parser.add_argument(
        "--train-dir",
        default=None,
        type=str,
        help="directory including training data. ",
    )
    parser.add_argument(
        "--dev-dir",
        default=None,
        type=str,
        help="directory including development data. ",
    )
    parser.add_argument(
        "--use-norm", default=1, type=int, help="use norm-mels for train or raw."
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save checkpoints."
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml format configuration file."
    )
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument(
        "--mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    args = parser.parse_args()

    # check arguments before any side effect (TF options, directory creation)
    if args.train_dir is None:
        raise ValueError("Please specify --train-dir")
    if args.dev_dir is None:
        # The flag is spelled --dev-dir; the message must name the real option.
        raise ValueError("Please specify --dev-dir")

    # set mixed precision config
    if args.mixed_precision == 1:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    # Converted to bool *before* being merged into the config below, so the
    # saved config.yml stores booleans rather than 0/1.
    args.mixed_precision = bool(args.mixed_precision)
    args.use_norm = bool(args.use_norm)

    # set logger
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(
        level=log_level,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if args.verbose <= 0:
        logging.warning("Skip DEBUG/INFO messages")

    # make sure the output directory exists (no check-then-create race)
    os.makedirs(args.outdir, exist_ok=True)

    # load and save config
    # NOTE(security): yaml.Loader can instantiate arbitrary Python objects.
    # Only pass config files you trust to --config.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))  # CLI arguments override same-named config keys
    config["version"] = tensorflow_tts.__version__
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info("%s = %s", key, value)

    # get dataset
    mel_length_threshold = (
        config["mel_length_threshold"] if config["remove_short_samples"] else None
    )

    if config["format"] != "npy":
        raise ValueError("Only npy are supported.")
    charactor_query = "*-ids.npy"
    mel_query = "*-norm-feats.npy" if args.use_norm else "*-raw-feats.npy"
    duration_query = "*-durations.npy"
    charactor_load_fn = np.load
    mel_load_fn = np.load
    duration_load_fn = np.load

    # define train/valid dataset
    train_dataset = CharactorDurationMelDataset(
        root_dir=args.train_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        duration_query=duration_query,
        charactor_load_fn=charactor_load_fn,
        mel_load_fn=mel_load_fn,
        duration_load_fn=duration_load_fn,
        mel_length_threshold=mel_length_threshold,
        return_utt_id=False,
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"],
    )

    # The validation set is never length-filtered (mel_length_threshold=None).
    valid_dataset = CharactorDurationMelDataset(
        root_dir=args.dev_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        duration_query=duration_query,
        charactor_load_fn=charactor_load_fn,
        mel_load_fn=mel_load_fn,
        duration_load_fn=duration_load_fn,
        mel_length_threshold=None,
        return_utt_id=False,
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"],
    )

    fastspeech = TFFastSpeech(
        config=FASTSPEECH_CONFIG.FastSpeechConfig(**config["fastspeech_params"])
    )
    fastspeech._build()
    fastspeech.summary()

    # define trainer
    # NOTE(review): is_mixed_precision is hard-coded to False, so
    # --mixed_precision only toggles the graph-optimizer option above.
    # Confirm this is intentional.
    trainer = FastSpeechTrainer(
        config=config, steps=0, epochs=0, is_mixed_precision=False
    )

    # AdamW for fastspeech
    optimizer_params = config["optimizer_params"]
    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=optimizer_params["initial_learning_rate"],
        decay_steps=optimizer_params["decay_steps"],
        end_learning_rate=optimizer_params["end_learning_rate"],
    )

    # Wrap the decay schedule with a warm-up phase covering
    # `warmup_proportion` of `train_max_steps`.
    learning_rate_fn = WarmUp(
        initial_learning_rate=optimizer_params["initial_learning_rate"],
        decay_schedule_fn=learning_rate_fn,
        warmup_steps=int(
            config["train_max_steps"] * optimizer_params["warmup_proportion"]
        ),
    )

    optimizer = AdamWeightDecay(
        learning_rate=learning_rate_fn,
        weight_decay_rate=optimizer_params["weight_decay"],
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
    )

    # compile trainer
    trainer.compile(model=fastspeech, optimizer=optimizer)

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config["outdir"], "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        # Ctrl-C is treated as "stop and keep progress", not as an error.
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")
def main():
    """Run fastspeech decoding from folder.

    Reads every ``*-ids.npy`` file under ``--rootdir``, runs batched FastSpeech
    inference with speaker id 0 and speed ratio 1.0 for each item, and writes
    two float32 arrays per utterance into ``--outdir``:
    ``<utt_id>-fs-before-feats.npy`` and ``<utt_id>-fs-after-feats.npy``.
    Each array is cut to the sum of the predicted durations of its utterance.

    Raises:
        ValueError: if the configured ``format`` is not ``"npy"``.
    """
    parser = argparse.ArgumentParser(
        description="Decode soft-mel features from charactor with trained FastSpeech "
        "(See detail in examples/fastspeech/decode_fastspeech.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="directory including ids/durations files.",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        required=True,
        help="directory to save generated speech.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        required=True,
        help="checkpoint file to be loaded.",
    )
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        required=True,
        help="yaml format configuration file. if not explicitly provided, "
        "it will be searched in the checkpoint directory. (default=None)",
    )
    parser.add_argument(
        "--batch-size",
        default=8,
        type=int,
        required=False,
        help="Batch size for inference.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger: pick the level once, then configure in a single call
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if not args.verbose > 0:
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    out_dir_exists = os.path.exists(args.outdir)
    if not out_dir_exists:
        os.makedirs(args.outdir)

    # load config; command-line arguments override same-named config keys
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)
    config.update(vars(args))

    if config["format"] != "npy":
        raise ValueError("Only npy is supported.")
    char_query = "*-ids.npy"
    char_load_fn = np.load

    # define data-loader
    batches = CharactorDataset(
        root_dir=args.rootdir,
        charactor_query=char_query,
        charactor_load_fn=char_load_fn,
    ).create(batch_size=args.batch_size)

    # define model and load checkpoint
    model_config = FastSpeechConfig(**config["fastspeech_params"])
    fastspeech = TFFastSpeech(config=model_config, name="fastspeech")
    fastspeech._build()
    fastspeech.load_weights(args.checkpoint)

    for batch in tqdm(batches, desc="Decoding"):
        utt_ids = batch["utt_ids"]
        char_ids = batch["input_ids"]
        n_items = tf.shape(char_ids)[0]

        # fastspeech inference: speaker 0 and unit speed for every item.
        mel_before_batch, mel_after_batch, duration_batch = fastspeech.inference(
            char_ids,
            speaker_ids=tf.zeros(shape=[n_items], dtype=tf.int32),
            speed_ratios=tf.ones(shape=[n_items], dtype=tf.float32),
        )

        # convert to numpy
        mel_before_batch = mel_before_batch.numpy()
        mel_after_batch = mel_after_batch.numpy()

        per_utterance = zip(utt_ids, mel_before_batch, mel_after_batch, duration_batch)
        for utt_tensor, mel_before, mel_after, durations in per_utterance:
            # real len of mel predicted = sum of the predicted durations
            n_frames = durations.numpy().sum()
            utt_id = utt_tensor.numpy().decode("utf-8")

            # save to folder.
            before_path = os.path.join(args.outdir, f"{utt_id}-fs-before-feats.npy")
            before_feats = mel_before[:n_frames, :].astype(np.float32)
            np.save(before_path, before_feats, allow_pickle=False)

            after_path = os.path.join(args.outdir, f"{utt_id}-fs-after-feats.npy")
            after_feats = mel_after[:n_frames, :].astype(np.float32)
            np.save(after_path, after_feats, allow_pickle=False)