Example #1
def test_tacotron2_train_some_layers(var_train_expr, config_path):
    config = Tacotron2Config(n_speakers=5, reduction_factor=1)
    model = TFTacotron2(config, training=True)
    model._build()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    config.update({"outdir": "./"})
    config.update({"var_train_expr": var_train_expr})

    STRATEGY = return_strategy()

    trainer = Tacotron2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=False,
    )
    trainer.compile(model, optimizer)

    len_trainable_vars = len(trainer._trainable_variables)
    all_trainable_vars = len(model.trainable_variables)

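    # var_train_expr restricts training to variables whose names match the
    # expression; None leaves every variable trainable.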
    if var_train_expr is None:
        tf.debugging.assert_equal(len_trainable_vars, all_trainable_vars)
    else:
        tf.debugging.assert_less(len_trainable_vars, all_trainable_vars)
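
Every example in this listing calls return_strategy() from tensorflow_tts.utils before building models. As a reference, here is a minimal sketch consistent with how the trainers use it (choose a tf.distribute strategy from the visible GPUs); treat it as an illustration, not the library's exact code:

import tensorflow as tf

def return_strategy():
    # choose a distribution strategy based on how many GPUs are visible
    physical_devices = tf.config.list_physical_devices("GPU")
    if len(physical_devices) == 0:
        return tf.distribute.OneDeviceStrategy(device="/cpu:0")
    if len(physical_devices) == 1:
        return tf.distribute.OneDeviceStrategy(device="/gpu:0")
    return tf.distribute.MirroredStrategy()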
Example #2
def main():
  """Run decoding process."""
  parser = argparse.ArgumentParser(
    description="Decode with MultiBand MelGAN (See detail in examples/multiband_melgan/train_multiband_melgan.py)"
  )
  parser.add_argument("--feature", '-f', required=True)
  parser.add_argument("--config", '-c', required=True)
  parser.add_argument("--restore_am", '-am', required=True)
  parser.add_argument("--restore_iam", '-iam', required=True)
  args = parser.parse_args()

  # return strategy
  STRATEGY = return_strategy()

  # load and save config
  with open(args.config) as f:
    config = yaml.load(f, Loader=yaml.Loader)
  with open(config['speech_config']) as f:
    speech_config = yaml.load(f, Loader=yaml.Loader)
  config.update(speech_config)
  config['n_mels'] = config['asr_features']
  config['hop_size'] = config['asr_downsample'] * config['sample_rate'] * config['stride_ms'] // 1000

  config.update(vars(args))
  config["version"] = tensorflow_tts.__version__
  for key, value in config.items():
    logging.info(f"{key} = {value}")

  with STRATEGY.scope():
    generator = MelGANGenerator(
      config=MultiBandMelGANGeneratorConfig(
        **config["multiband_melgan_generator_params"]
      ),
      name="multi_band_melgan_generator",
    )
    generator.set_shape(config['n_mels'])

    pqmf = TFPQMF(
      MultiBandMelGANGeneratorConfig(
        **config["multiband_melgan_generator_params"]
      ),
      dtype=tf.float32,
      name="pqmf",
    )

    # dummy input to build model.
    fake_mels = tf.random.uniform(shape=[1, 100, config['n_mels']], dtype=tf.float32)
    output = generator(mels=fake_mels, training=False)
    y_hat = pqmf.synthesis(output)
    print('y_hat', y_hat.shape)
    # assumed: restore the generator from the --restore_am checkpoint
    # (this excerpt's parser defines no --resume argument)
    generator.load_weights(args.restore_am)


  # assumed setup, mirroring the VQ inference example later in this listing:
  speech_featurizer = TFSpeechFeaturizer(speech_config)
  if args.feature.endswith('.npy'):
    mels = tf.constant(np.load(args.feature), tf.float32)
  elif args.feature.endswith('.wav'):
    signal, _ = librosa.load(args.feature, sr=config['sample_rate'])
    mels = speech_featurizer.tf_extract(signal)
    with tf.device('/cpu:0'):
      mels = conformer.encoder_inference(mels)  # conformer: ASR encoder assumed loaded elsewhere
Example #3
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description="Train FastSpeech (See detail in tensorflow_tts/bin/train-fastspeech.py)"
    )
    parser.add_argument(
        "--train-dir",
        default=None,
        type=str,
        help="directory including training data. ",
    )
    parser.add_argument(
        "--dev-dir",
        default=None,
        type=str,
        help="directory including development data. ",
    )
    parser.add_argument(
        "--use-norm", default=1, type=int, help="usr norm-mels for train or raw."
    )
    parser.add_argument(
        "--f0-stat",
        default="./dump/stats_f0.npy",
        type=str,
        required=True,
        help="f0-stat path.",
    )
    parser.add_argument(
        "--energy-stat",
        default="./dump/stats_energy.npy",
        type=str,
        required=True,
        help="energy-stat path.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save checkpoints."
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml format configuration file."
    )
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument(
        "--mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    parser.add_argument(
        "--pretrained",
        default="",
        type=str,
        nargs="?",
        help="pretrained weights .h5 file to load weights from. Auto-skips non-matching layers",
    )

    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # set mixed precision config
    if args.mixed_precision == 1:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    args.mixed_precision = bool(args.mixed_precision)
    args.use_norm = bool(args.use_norm)

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # check arguments
    if args.train_dir is None:
        raise ValueError("Please specify --train-dir")
    if args.dev_dir is None:
        raise ValueError("Please specify --valid-dir")

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["mel_length_threshold"]
    else:
        mel_length_threshold = None

    if config["format"] == "npy":
        charactor_query = "*-ids.npy"
        mel_query = "*-raw-feats.npy" if args.use_norm is False else "*-norm-feats.npy"
        duration_query = "*-durations.npy"
        f0_query = "*-raw-f0.npy"
        energy_query = "*-raw-energy.npy"
    else:
        raise ValueError("Only npy are supported.")

    # define train/valid dataset
    train_dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.train_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        duration_query=duration_query,
        f0_query=f0_query,
        energy_query=energy_query,
        f0_stat=args.f0_stat,
        energy_stat=args.energy_stat,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"]
        * STRATEGY.num_replicas_in_sync
        * config["gradient_accumulation_steps"],
    )

    valid_dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.dev_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        duration_query=duration_query,
        f0_query=f0_query,
        energy_query=energy_query,
        f0_stat=args.f0_stat,
        energy_stat=args.energy_stat,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
    )

    # define trainer
    trainer = FastSpeech2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=args.mixed_precision,
    )

    with STRATEGY.scope():
        # define model
        fastspeech = TFFastSpeech2(
            config=FastSpeech2Config(**config["fastspeech2_params"])
        )
        fastspeech._build()
        fastspeech.summary()
        if len(args.pretrained) > 1:
            fastspeech.load_weights(args.pretrained, by_name=True, skip_mismatch=True)
            logging.info(
                f"Successfully loaded pretrained weight from {args.pretrained}."
            )

        # AdamW for fastspeech
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=config["optimizer_params"]["initial_learning_rate"],
            decay_steps=config["optimizer_params"]["decay_steps"],
            end_learning_rate=config["optimizer_params"]["end_learning_rate"],
        )

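        # wrap the decay schedule: ramp the learning rate linearly over the
        # first warmup_steps, then hand off to the polynomial decay above.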
        learning_rate_fn = WarmUp(
            initial_learning_rate=config["optimizer_params"]["initial_learning_rate"],
            decay_schedule_fn=learning_rate_fn,
            warmup_steps=int(
                config["train_max_steps"]
                * config["optimizer_params"]["warmup_proportion"]
            ),
        )

        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=config["optimizer_params"]["weight_decay"],
            beta_1=0.9,
            beta_2=0.98,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )

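        # touch optimizer.iterations so the step counter is created inside the strategy scope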
        _ = optimizer.iterations

    # compile trainer
    trainer.compile(model=fastspeech, optimizer=optimizer)

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config["outdir"], "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")
Example #4
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description=
        "Train MelGAN (See detail in tensorflow_tts/bin/train-melgan.py)")
    parser.add_argument(
        "--train-dir",
        default=None,
        type=str,
        help="directory including training data. ",
    )
    parser.add_argument(
        "--dev-dir",
        default=None,
        type=str,
        help="directory including development data. ",
    )
    parser.add_argument("--use-norm",
                        default=1,
                        type=int,
                        help="use norm mels for training or raw.")
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument(
        "--generator_mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    parser.add_argument(
        "--discriminator_mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for discriminator or not.",
    )
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # set mixed precision config
    if args.generator_mixed_precision == 1 or args.discriminator_mixed_precision == 1:
        tf.config.optimizer.set_experimental_options(
            {"auto_mixed_precision": True})

    args.generator_mixed_precision = bool(args.generator_mixed_precision)
    args.discriminator_mixed_precision = bool(
        args.discriminator_mixed_precision)

    args.use_norm = bool(args.use_norm)

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # check arguments
    if args.train_dir is None:
        raise ValueError("Please specify --train-dir")
    if args.dev_dir is None:
        raise ValueError("Please specify either --valid-dir")

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["batch_max_steps"] // config[
            "hop_size"] + 2 * config["melgan_generator_params"].get(
                "aux_context_window", 0)
    else:
        mel_length_threshold = None

    if config["format"] == "npy":
        audio_query = "*-wave.npy"
        mel_query = "*-raw-feats.npy" if args.use_norm is False else "*-norm-feats.npy"
        audio_load_fn = np.load
        mel_load_fn = np.load
    else:
        raise ValueError("Only npy are supported.")

    # define train/valid dataset
    train_dataset = AudioMelDataset(
        root_dir=args.train_dir,
        audio_query=audio_query,
        mel_query=mel_query,
        audio_load_fn=audio_load_fn,
        mel_load_fn=mel_load_fn,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        map_fn=lambda items: collater(
            items,
            batch_max_steps=tf.constant(config["batch_max_steps"],
                                        dtype=tf.int32),
            hop_size=tf.constant(config["hop_size"], dtype=tf.int32),
        ),
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
    )

    valid_dataset = AudioMelDataset(
        root_dir=args.dev_dir,
        audio_query=audio_query,
        mel_query=mel_query,
        audio_load_fn=audio_load_fn,
        mel_load_fn=mel_load_fn,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        map_fn=lambda items: collater(
            items,
            batch_max_steps=tf.constant(config["batch_max_steps_valid"],
                                        dtype=tf.int32),
            hop_size=tf.constant(config["hop_size"], dtype=tf.int32),
        ),
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
    )

    # define trainer
    trainer = MelganTrainer(
        steps=0,
        epochs=0,
        config=config,
        strategy=STRATEGY,
        is_generator_mixed_precision=args.generator_mixed_precision,
        is_discriminator_mixed_precision=args.discriminator_mixed_precision,
    )

    # define generator and discriminator
    with STRATEGY.scope():
        generator = TFMelGANGenerator(
            MELGAN_CONFIG.MelGANGeneratorConfig(
                **config["melgan_generator_params"]),
            name="melgan_generator",
        )

        discriminator = TFMelGANMultiScaleDiscriminator(
            MELGAN_CONFIG.MelGANDiscriminatorConfig(
                **config["melgan_discriminator_params"]),
            name="melgan_discriminator",
        )

        # dummy input to build model.
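        # batch and time axes are arbitrary (the generator is fully
        # convolutional); 80 is the number of mel bins the generator expects.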
        fake_mels = tf.random.uniform(shape=[1, 100, 80], dtype=tf.float32)
        y_hat = generator(fake_mels)
        discriminator(y_hat)

        generator.summary()
        discriminator.summary()

        gen_optimizer = tf.keras.optimizers.Adam(
            **config["generator_optimizer_params"])
        dis_optimizer = tf.keras.optimizers.Adam(
            **config["discriminator_optimizer_params"])

    trainer.compile(
        gen_model=generator,
        dis_model=discriminator,
        gen_optimizer=gen_optimizer,
        dis_optimizer=dis_optimizer,
    )

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config["outdir"], "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")
Example #5
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description="Train/finetune Tacotron2 with reinitiated Embeddings")
    parser.add_argument(
        "--train-dir",
        default=None,
        type=str,
        help="directory including training data. ",
    )
    parser.add_argument(
        "--dev-dir",
        default=None,
        type=str,
        help="directory including development data. ",
    )
    parser.add_argument("--use-norm",
                        default=1,
                        type=int,
                        help="usr norm-mels for train or raw.")
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument(
        "--mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    parser.add_argument(
        "--pretrained",
        default="",
        type=str,
        nargs="?",
        help=
        "pretrained weights .h5 file to load weights from. Auto-skips non-matching layers",
    )
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # set mixed precision config
    if args.mixed_precision == 1:
        tf.config.optimizer.set_experimental_options(
            {"auto_mixed_precision": True})

    args.mixed_precision = bool(args.mixed_precision)
    args.use_norm = bool(args.use_norm)

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # check arguments
    if args.train_dir is None:
        raise ValueError("Please specify --train-dir")
    if args.dev_dir is None:
        raise ValueError("Please specify --valid-dir")

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["mel_length_threshold"]
    else:
        mel_length_threshold = 0

    if config["format"] == "npy":
        charactor_query = "*-ids.npy"
        mel_query = "*-raw-feats.npy" if args.use_norm is False else "*-norm-feats.npy"
        charactor_load_fn = np.load
        mel_load_fn = np.load
    else:
        raise ValueError("Only npy are supported.")

    train_dataset = CharactorMelDataset(
        dataset=config["tacotron2_params"]["dataset"],
        root_dir=args.train_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        charactor_load_fn=charactor_load_fn,
        mel_load_fn=mel_load_fn,
        mel_length_threshold=mel_length_threshold,
        reduction_factor=config["tacotron2_params"]["reduction_factor"],
        use_fixed_shapes=config["use_fixed_shapes"],
    )

    # update max_mel_length and max_char_length to config
    config.update({"max_mel_length": int(train_dataset.max_mel_length)})
    config.update({"max_char_length": int(train_dataset.max_char_length)})

    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    train_dataset = train_dataset.create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync *
        config["gradient_accumulation_steps"],
    )

    valid_dataset = CharactorMelDataset(
        dataset=config["tacotron2_params"]["dataset"],
        root_dir=args.dev_dir,
        charactor_query=charactor_query,
        mel_query=mel_query,
        charactor_load_fn=charactor_load_fn,
        mel_load_fn=mel_load_fn,
        mel_length_threshold=mel_length_threshold,
        reduction_factor=config["tacotron2_params"]["reduction_factor"],
        use_fixed_shapes=False,  # don't need apply fixed shape for evaluation.
    ).create(
        is_shuffle=config["is_shuffle"],
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
    )

    # define trainer
    trainer = Tacotron2Trainer(
        config=config,
        strategy=STRATEGY,
        steps=0,
        epochs=0,
        is_mixed_precision=args.mixed_precision,
    )

    with STRATEGY.scope():
        # define model.
        tacotron_config = Tacotron2Config(**config["tacotron2_params"])
        tacotron2 = TFTacotron2(config=tacotron_config, name="tacotron2")
        tacotron2._build()
        tacotron2.summary()

        if len(args.pretrained) > 1:
            tacotron2.load_weights(args.pretrained,
                                   by_name=True,
                                   skip_mismatch=True)
            logging.info(
                f"Successfully loaded pretrained weight from {args.pretrained}."
            )

        # re-define embedding
        NEW_VOCAB_SIZE = 44  # 149 -> LJSpeech-mapper, 44 -> TPI-mapper
        tacotron_config.vocab_size = NEW_VOCAB_SIZE
        new_embedding_layers = TFTacotronEmbeddings(tacotron_config,
                                                    name='embeddings')
        tacotron2.embeddings = new_embedding_layers
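        # the swapped-in embedding layer is freshly initialized; every other
        # pretrained weight is kept as loaded above.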

        # re-build model
        tacotron2._build()
        tacotron2.summary()

        # AdamW for tacotron2
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=config["optimizer_params"]
            ["initial_learning_rate"],
            decay_steps=config["optimizer_params"]["decay_steps"],
            end_learning_rate=config["optimizer_params"]["end_learning_rate"],
        )

        learning_rate_fn = WarmUp(
            initial_learning_rate=config["optimizer_params"]
            ["initial_learning_rate"],
            decay_schedule_fn=learning_rate_fn,
            warmup_steps=int(config["train_max_steps"] *
                             config["optimizer_params"]["warmup_proportion"]),
        )

        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=config["optimizer_params"]["weight_decay"],
            beta_1=0.9,
            beta_2=0.98,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )

        _ = optimizer.iterations

    # compile trainer
    trainer.compile(model=tacotron2, optimizer=optimizer)

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config["outdir"], "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")
Example #6
def main():
    """Run decoding process."""
    parser = argparse.ArgumentParser(
        description=
        "Decode with MultiBand MelGAN (See detail in examples/multiband_melgan/train_multiband_melgan.py)"
    )
    parser.add_argument("--feature", '-f', required=True)
    parser.add_argument("--speaker", '-s', required=True)
    parser.add_argument("--config", '-c', required=True)
    parser.add_argument("--resume", '-r', required=True)
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    with open(config['speech_config']) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    config.update(speech_config)
    config['hop_size'] = config['sample_rate'] * config['stride_ms'] // 1000
    config['sampling_rate'] = config['sample_rate']

    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    with STRATEGY.scope():
        encoder = Encoder(**config['encoder'])

        generator = MelGANGeneratorVQ(
            encoder=encoder,
            config=MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            name="multi_band_melgan_generator",
        )
        generator.set_shape(config['n_mels'], config['gc_channels'])

        pqmf = TFPQMF(
            MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            dtype=tf.float32,
            name="pqmf",
        )

        # dummy input to build model.
        fake_mels = tf.random.uniform(shape=[1, 100, config['n_mels']],
                                      dtype=tf.float32)
        fake_gc = tf.random.uniform(shape=[1, config['gc_channels']],
                                    dtype=tf.float32)
        y_mb_hat = generator(mels=fake_mels, gc=fake_gc,
                             training=False)['y_mb_hat']
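        # pqmf.synthesis recombines the generator's sub-band signals into a full-band waveform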
        y_hat = pqmf.synthesis(y_mb_hat)

        generator.load_weights(args.resume)
        generator.summary()

    speech_featurizer = TFSpeechFeaturizer(speech_config)
    if args.feature.endswith('_mel.npy'):
        mels = tf.constant(np.load(args.feature), tf.float32)
    else:
        signal, _ = librosa.load(args.feature, sr=config['sample_rate'])
        mels = speech_featurizer.tf_extract(signal)
    mels = tf.reshape(mels, [1, -1, config['n_mels']])

    gc = tf.constant(
        np.load(args.speaker).reshape([1, config['gc_channels']]), tf.float32)
    # gc = tf.constant(np.zeros(256).reshape([1, config['gc_channels']]), tf.float32)
    output = generator(mels=mels, gc=gc, training=False)['y_mb_hat']
    y_hat = pqmf.synthesis(output).numpy().reshape([-1])
    print('output:', y_hat.shape)
    # pick the output name from whichever input type was given; the original
    # unconditional double-replace left .wav inputs pointing at themselves.
    if args.feature.endswith('.wav'):
        save_name = args.feature.replace('.wav', '_gen_vc.wav')
    else:
        save_name = args.feature.replace('_mel.npy', '_gen_vc.wav')
    save_name = save_name.split('/')[-1]
    wavfile.write(save_name, config['sample_rate'], y_hat)

    def depreemphasis(signal: np.ndarray, coeff=0.97):
        """Invert pre-emphasis with the one-pole filter x[n] = coeff * x[n-1] + signal[n]."""
        if not coeff or coeff <= 0.0:
            return signal
        x = np.zeros(signal.shape[0], dtype=np.float32)
        x[0] = signal[0]
        for n in range(1, signal.shape[0]):
            x[n] = coeff * x[n - 1] + signal[n]
        return x

    y_hat = depreemphasis(y_hat)
    wavfile.write(save_name.replace('.wav', '_depre.wav'),
                  config['sample_rate'], y_hat)
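
The loop in depreemphasis is a one-pole IIR filter; assuming scipy is available, the same output (up to float precision) comes from scipy.signal.lfilter, which is much faster on long signals:

from scipy.signal import lfilter

y_hat = lfilter([1.0], [1.0, -0.97], y_hat).astype(np.float32)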
Example #7
from sklearn.preprocessing import StandardScaler
from tensorflow_tts.models import TFFastSpeech2
from tensorflow_tts.optimizers import AdamWeightDecay, WarmUp
from tensorflow_tts.trainers import Seq2SeqBasedTrainer
from tensorflow_tts.utils import (calculate_2d_loss, calculate_3d_loss,
                                  return_strategy)
sys.path.append(".")  # ensure the local Processor module is importable
from Processor import JSpeechProcessor

physical_devices = tf.config.list_physical_devices("GPU")
for i in range(len(physical_devices)):
    tf.config.experimental.set_memory_growth(physical_devices[i], True)

# return strategy
STRATEGY = return_strategy()
"""
datasets
    feats
        one.f32
        two.f32
    pcm
        one.s16
    f0
        one.f0
    durations
        one.dur
    energies
        one.e
    metadata.csv    
"""
Example #8
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description=
        "Train MultiBand MelGAN (See detail in examples/multiband_melgan/train_multiband_melgan.py)"
    )
    parser.add_argument(
        "--train-dir",
        '-td',
        default=None,
        type=str,
        help="directory including training data. ",
    )
    parser.add_argument(
        "--dev-dir",
        '-dd',
        default=None,
        type=str,
        help="directory including development data. ",
    )
    parser.add_argument(
        "--audio-query",
        '-aq',
        default='*_wav.npy',
        type=str,
        help="suffix of audio file",
    )
    parser.add_argument(
        "--mel-query",
        '-mq',
        default='*_conformer_enc16.npy',
        type=str,
        help="suffix of mel file",
    )
    parser.add_argument("--use-norm",
                        default=1,
                        type=int,
                        help="use norm mels for training or raw.")
    parser.add_argument("--outdir",
                        '-od',
                        type=str,
                        required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config",
                        '-c',
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--resume",
        '-r',
        default="",
        type=str,
        nargs="?",
        help='checkpoint file path to resume training. (default="")',
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    parser.add_argument(
        "--generator_mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for generator or not.",
    )
    parser.add_argument(
        "--discriminator_mixed_precision",
        default=0,
        type=int,
        help="using mixed precision for discriminator or not.",
    )
    parser.add_argument(
        "--pretrained",
        default="",
        type=str,
        nargs="?",
        help="path of .h5 mb-melgan generator to load weights from",
    )
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # set mixed precision config
    if args.generator_mixed_precision == 1 or args.discriminator_mixed_precision == 1:
        tf.config.optimizer.set_experimental_options(
            {"auto_mixed_precision": True})

    args.generator_mixed_precision = bool(args.generator_mixed_precision)
    args.discriminator_mixed_precision = bool(
        args.discriminator_mixed_precision)

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # check arguments
    if args.train_dir is None:
        raise ValueError("Please specify --train-dir")
    if args.dev_dir is None:
        raise ValueError("Please specify either --valid-dir")

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    with open(config['speech_config']) as f:
        mel_config = yaml.load(f, Loader=yaml.Loader)
    config.update(mel_config)
    config['n_mels'] = config['asr_features']
    config['hop_size'] = config['asr_downsample'] * config[
        'sample_rate'] * config['stride_ms'] // 1000
    config['sampling_rate'] = config['sample_rate']

    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["batch_max_steps"] // config["hop_size"] \
          + 2 * config["multiband_melgan_generator_params"].get("aux_context_window", 0)
    else:
        mel_length_threshold = None

    audio_query = args.audio_query
    mel_query = args.mel_query
    audio_load_fn = np.load
    mel_load_fn = np.load

    # define train/valid dataset
    train_dataset = Mel(
        n_mels=config['n_mels'],
        training=True,
        root_dir=args.train_dir,
        audio_query=audio_query,
        mel_query=mel_query,
        audio_load_fn=audio_load_fn,
        mel_load_fn=mel_load_fn,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        map_fn=lambda items: collater(
            items,
            batch_max_steps=tf.constant(config["batch_max_steps"],
                                        dtype=tf.int32),
            hop_size=tf.constant(config["hop_size"], dtype=tf.int32),
        ),
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync *
        config["gradient_accumulation_steps"],
    )

    valid_dataset = Mel(
        n_mels=config['n_mels'],
        training=False,
        root_dir=args.dev_dir,
        audio_query=audio_query,
        mel_query=mel_query,
        audio_load_fn=audio_load_fn,
        mel_load_fn=mel_load_fn,
        mel_length_threshold=mel_length_threshold,
    ).create(
        is_shuffle=config["is_shuffle"],
        map_fn=lambda items: collater(
            items,
            batch_max_steps=tf.constant(config["batch_max_steps_valid"],
                                        dtype=tf.int32),
            hop_size=tf.constant(config["hop_size"], dtype=tf.int32),
        ),
        allow_cache=config["allow_cache"],
        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
    )

    # define trainer
    trainer = MultiBandMelganTrainer(
        steps=0,
        epochs=0,
        config=config,
        strategy=STRATEGY,
        is_generator_mixed_precision=args.generator_mixed_precision,
        is_discriminator_mixed_precision=args.discriminator_mixed_precision,
    )

    with STRATEGY.scope():
        # define generator and discriminator
        generator = MelGANGenerator(
            config=MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            name="multi_band_melgan_generator",
        )
        generator.set_shape(config['n_mels'])

        discriminator = TFMelGANMultiScaleDiscriminator(
            MultiBandMelGANDiscriminatorConfig(
                **config["multiband_melgan_discriminator_params"]),
            name="multi_band_melgan_discriminator",
        )

        pqmf = TFPQMF(
            MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            dtype=tf.float32,
            name="pqmf",
        )

        # dummy input to build model.
        fake_mels = tf.random.uniform(shape=[1, 100, config['n_mels']],
                                      dtype=tf.float32)
        y_mb_hat = generator(fake_mels)
        y_hat = pqmf.synthesis(y_mb_hat)
        discriminator(y_hat)

        if len(args.pretrained) > 1:
            generator.load_weights(args.pretrained)
            logging.info(
                f"Successfully loaded pretrained weight from {args.pretrained}."
            )

        generator.summary()
        discriminator.summary()

        # define optimizer
        generator_lr_fn = getattr(
            tf.keras.optimizers.schedules,
            config["generator_optimizer_params"]["lr_fn"])(
                **config["generator_optimizer_params"]["lr_params"])
        discriminator_lr_fn = getattr(
            tf.keras.optimizers.schedules,
            config["discriminator_optimizer_params"]["lr_fn"],
        )(**config["discriminator_optimizer_params"]["lr_params"])

        gen_optimizer = tf.keras.optimizers.Adam(
            learning_rate=generator_lr_fn,
            amsgrad=config["generator_optimizer_params"]["amsgrad"],
        )
        dis_optimizer = tf.keras.optimizers.Adam(
            learning_rate=discriminator_lr_fn,
            amsgrad=config["discriminator_optimizer_params"]["amsgrad"],
        )

    trainer.compile(
        gen_model=generator,
        dis_model=discriminator,
        gen_optimizer=gen_optimizer,
        dis_optimizer=dis_optimizer,
        pqmf=pqmf,
    )

    # start training
    try:
        trainer.fit(
            train_dataset,
            valid_dataset,
            saved_path=os.path.join(config["outdir"], "checkpoints/"),
            resume=args.resume,
        )
    except KeyboardInterrupt:
        trainer.save_checkpoint()
        logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")