def main():
    """Parse the vocoder-training command line and start training.

    The parsed ``--models_dir`` is converted to a ``Path`` and created if it
    does not exist yet (including missing parent directories). The arguments
    are then echoed through ``print_args`` and every one of them is forwarded
    to ``train`` as a keyword argument (``run_id``, ``models_dir``,
    ``metadata_path``, ``weights_path``, ``ground_truth``, ``save_every``,
    ``backup_every`` and ``force_restart``).
    """
    parser = argparse.ArgumentParser(
        description=
        "Trains the vocoder from the synthesizer audios and the GTA synthesized mels, "
        "or ground truth mels.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # --run_id is optional and has no default, so ``train`` receives
    # run_id=None when it is omitted.
    parser.add_argument("--run_id", type=str, help=
        "Name for this model instance. If a model state from the same run ID was previously "
        "saved, the training will restart from there. Pass -f to overwrite saved states and "
        "restart from scratch.")

    parser.add_argument("-m", "--models_dir", type=str, default="vocoder/saved_models/", help=
        "Path to the directory that will contain the saved model weights, as well as backups "
        "of those weights and wavs generated during training.")

    parser.add_argument("-d", "--metadata_path", default="")

    parser.add_argument("-w", "--weights_path", default="")

    # NOTE(review): this parser no longer defines datasets_root / --syn_dir /
    # --voc_dir, yet the help text below still mentions <datasets_root> --
    # confirm what the help should say.
    parser.add_argument("-g", "--ground_truth", action="store_true", help=
        "Train on ground truth spectrograms (<datasets_root>/SV2TTS/synthesizer/mels).")

    parser.add_argument("-s", "--save_every", type=int, default=0, help=
        "Number of steps between updates of the model on the disk. Set to 0 to never save the "
        "model.")

    parser.add_argument("-b", "--backup_every", type=int, default=15000, help=
        "Number of steps between backups of the model. Set to 0 to never make backups of the "
        "model.")

    parser.add_argument("-f", "--force_restart", default=False, action="store_true", help=
        "Do not load any saved model and restart from scratch.")

    args = parser.parse_args()

    # The default "vocoder/saved_models/" is nested: without parents=True the
    # call raises FileNotFoundError when "vocoder/" itself is missing.
    args.models_dir = Path(args.models_dir)
    args.models_dir.mkdir(parents=True, exist_ok=True)

    # Run the training
    print_args(args, parser)
    train(**vars(args))
def encoder_train(run_id, user_folder):
    """Build the speaker-encoder training arguments and run ``train``.

    Args:
        run_id: Default value for the ``--run_id`` option.
        user_folder: Default value for the ``--user_folder`` option.

    Returns:
        Whatever ``train`` returns (bound to ``ckpt`` here).

    Note:
        ``parser.parse_args()`` reads ``sys.argv``, so options given on the
        process command line override the defaults supplied through the two
        parameters above.
    """
    parser = argparse.ArgumentParser(
        description=
        "Trains the speaker encoder. You must have run encoder_preprocess.py first.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--run_id", type=str, default=run_id, help=
        "Name for this model instance. If a model state from the same run ID was previously "
        "saved, the training will restart from there. Pass -f to overwrite saved states and "
        "restart from scratch.")
    parser.add_argument("--clean_data_root", type=Path, default='/home/ubuntu/VC_dataset/SV2TTS/encoder/', help=
        "Path to the output directory of encoder_preprocess.py. If you left the default "
        "output directory when preprocessing, it should be <datasets_root>/SV2TTS/encoder/.")
    parser.add_argument("-m", "--models_dir", type=Path, default="/home/ubuntu/Real-Time-Voice-Cloning/encoder/saved_models/", help=
        "Path to the output directory that will contain the saved model weights, as well as "
        "backups of those weights and plots generated during training.")
    parser.add_argument("-v", "--vis_every", type=int, default=20, help=
        "Number of steps between updates of the loss and the plots.")
    parser.add_argument("-u", "--umap_every", type=int, default=100, help=
        "Number of steps between updates of the umap projection. Set to 0 to never update the "
        "projections.")
    parser.add_argument("-s", "--save_every", type=int, default=500, help=
        "Number of steps between updates of the model on the disk. Set to 0 to never save the "
        "model.")
    parser.add_argument("-b", "--backup_every", type=int, default=10, help=
        "Number of steps between backups of the model. Set to 0 to never make backups of the "
        "model.")
    parser.add_argument("-f", "--force_restart", action="store_true", help=
        "Do not load any saved model.")
    parser.add_argument("--visdom_server",
                        type=str,
                        default="http://localhost")
    parser.add_argument("--no_visdom", action="store_true", help=
        "Disable visdom.")
    parser.add_argument(
        "--fine_tune_pretrained",
        type=Path,
        default=
        '/home/ubuntu/Real-Time-Voice-Cloning/encoder/saved_models/pretrained_cp.pt'
    )
    parser.add_argument("--user_folder", type=Path, default=user_folder)
    args = parser.parse_args()

    # Create the (nested) models directory; parents=True avoids a
    # FileNotFoundError when an intermediate directory does not exist yet.
    args.models_dir.mkdir(parents=True, exist_ok=True)

    # Run the training
    print_args(args, parser)
    ckpt = train(**vars(args))
    return ckpt
def main():
    """Parse the synthesizer-preprocessing command line and run each dataset.

    ``--datasets`` is a comma-separated list; each entry selects one
    preprocessing function from the dispatch table below. Every selected
    function is called with the remaining arguments as keywords
    (``datasets_root``, ``out_dir``, ``n_processes``, ``skip_existing`` and
    the parsed ``hparams``).

    Invalid input (an unknown dataset name or a missing ``datasets_root``) is
    reported through ``parser.error`` before any directory is created or any
    dataset is processed.
    """
    parser = argparse.ArgumentParser(
        description=
        "Preprocesses audio files from datasets, encodes them as mel spectrograms "
        "and writes them to  the disk. Audio files are also saved, to be used by the "
        "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("datasets_root", type=Path, help=
        "Path to the directory containing your LibriSpeech/TTS datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=
        "Path to the output directory that will contain the mel spectrograms, the audios and the "
        "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument("-n", "--n_processes", type=int, default=8, help=
        "Number of processes in parallel.")
    parser.add_argument("-s", "--skip_existing", action="store_true", help=
        "Whether to overwrite existing files with the same name. Useful if the preprocessing was "
        "interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=
        "Hyperparameter overrides as a comma-separated list of name-value pairs")
    parser.add_argument("-d",
                        "--datasets",
                        type=str,
                        default="librispeech_other")
    args = parser.parse_args()
    args.datasets = args.datasets.split(",")

    # Dispatch table: dataset name -> preprocessing function.
    preprocess_func = {
        "custom": preprocess_custom,
        "librispeech_other": preprocess_librispeech,
    }

    # Fail fast on unknown names instead of hitting a KeyError halfway
    # through, after earlier datasets have already been processed.
    unknown = [name for name in args.datasets if name not in preprocess_func]
    if unknown:
        parser.error("unknown dataset(s): %s (choose from: %s)" %
                     (", ".join(repr(name) for name in unknown),
                      ", ".join(sorted(preprocess_func))))

    # --out_dir uses argparse.SUPPRESS, so the attribute is absent (not None)
    # when the option is omitted.
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently skip the validation.
    if not args.datasets_root.exists():
        parser.error("datasets_root does not exist: %s" % args.datasets_root)
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Preprocess the dataset
    print_args(args, parser)
    args.hparams = hparams.parse(args.hparams)

    args = vars(args)

    # "datasets" is popped so that it is not forwarded as a keyword argument.
    for dataset in args.pop("datasets"):
        print("Preprocessing %s" % dataset)
        preprocess_func[dataset](**args)
def main():
    """Command-line entry point that creates the synthesizer embeddings.

    Parses ``synthesizer_root``, ``--encoder_model_fpath`` and
    ``--n_processes``, echoes them through ``print_args`` and forwards all
    three to ``create_embeddings`` as keyword arguments.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Creates embeddings for the synthesizer from the LibriSpeech utterances.",
    )

    # Positional: location of the synthesizer training data.
    cli.add_argument(
        "synthesizer_root",
        type=Path,
        help=("Path to the synthesizer training data that contains the audios and the train.txt file. "
              "If you let everything as default, it should be <datasets_root>/SV2TTS/synthesizer/."),
    )

    # Optional: encoder checkpoint used to compute the embeddings.
    cli.add_argument(
        "-e", "--encoder_model_fpath",
        type=Path,
        default="encoder/saved_models/pretrained.pt",
        help="Path your trained encoder model.",
    )

    # Optional: degree of parallelism.
    cli.add_argument(
        "-n", "--n_processes",
        type=int,
        default=4,
        help=("Number of parallel processes. An encoder is created for each, so you may need to lower "
              "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy."),
    )

    options = cli.parse_args()

    # Show the effective configuration, then run the embedding job.
    print_args(options, cli)
    create_embeddings(**vars(options))
Esempio n. 5
0
def main():
    """Synthesize test utterances for every (source, target) speaker pair.

    Reads ``train.txt`` under ``synthesizer_root`` through ``parse_meta``,
    keeps the first ``--num_speakers`` speakers and, for each ordered pair of
    them (a speaker is also paired with itself), writes the results of
    ``synthesize_ppg_batch_avg_embed`` and
    ``synthesize_ppg_batch_random_embed`` into
    ``<output_dir>/<source>-<target>/`` via ``save_wavs``.

    Only the first ``--num_utterances`` (sorted) wav and PPG paths of the
    source speaker are used; the target speaker contributes all of its embed
    and wav paths.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("synthesizer_root", type=Path, help=
        "Path to the synthesizer training data that contains the audios and the train.txt file. "
        "If you let everything as default, it should be <datasets_root>/SV2TTS/synthesizer/.")
    parser.add_argument("output_dir", type=Path, help=
        "Path to the output_synthesis")
    parser.add_argument("--num_speakers", type=int, default=4, help=
        "Number of speakers used for testing")
    parser.add_argument("--num_utterances", type=int, default=4, help=
        "Number of utterances per speaker used for testing")

    args = parser.parse_args()
    print_args(args, parser)

    meta_file_path = args.synthesizer_root.joinpath('train.txt')

    wav_path_dict, ppg_path_dict, embed_path_dict, speakers = parse_meta(
        args.synthesizer_root, meta_file_path)

    def paths_of(path_dict, speaker):
        """Return the sorted keys of ``path_dict`` whose value is ``speaker``."""
        return sorted(k for k, v in path_dict.items() if v == speaker)

    speakers = speakers[:args.num_speakers]

    # Scan each dictionary once per speaker up front. The per-target lists
    # were previously rebuilt for every (source, target) pair, i.e. a full
    # scan of the dictionaries num_speakers ** 2 times.
    per_speaker = [(speaker,
                    paths_of(wav_path_dict, speaker),
                    paths_of(ppg_path_dict, speaker),
                    paths_of(embed_path_dict, speaker))
                   for speaker in speakers]

    for speaker, speaker_wavs, speaker_ppgs, _ in per_speaker:
        wav_paths = speaker_wavs[:args.num_utterances]
        ppg_paths = speaker_ppgs[:args.num_utterances]
        for target_speaker, target_wavs, _, target_embeds in per_speaker:
            pair_dir = args.output_dir.joinpath('{}-{}'.format(
                speaker, target_speaker))
            print(pair_dir)
            pair_dir.mkdir(parents=True, exist_ok=True)
            # Hand out fresh copies, as before, so that a callee mutating
            # its argument cannot affect later pairs.
            embed_paths = list(target_embeds)
            target_wav_paths = list(target_wavs)
            generated_wavs_avg = synthesize_ppg_batch_avg_embed(
                ppg_paths, embed_paths)
            generated_wavs_random, embed_idx = synthesize_ppg_batch_random_embed(
                ppg_paths, embed_paths)
            save_wavs(wav_paths, target_wav_paths, generated_wavs_avg,
                      generated_wavs_random, embed_idx, pair_dir)
Esempio n. 6
0
def main():
    """Command-line entry point for ground-truth aligned (GTA) synthesis.

    Parses the options, exports the selected GPU id through
    ``CUDA_VISIBLE_DEVICES``, parses the hyperparameter overrides, fills in
    the default input/output directories under ``<datasets_root>/SV2TTS``
    when they were not given, and calls ``run_synthesis``.
    """

    class DefaultsRawFormatter(argparse.ArgumentDefaultsHelpFormatter,
                               argparse.RawDescriptionHelpFormatter):
        """Help formatter combining default display with a raw description."""

    cli = argparse.ArgumentParser(
        formatter_class=DefaultsRawFormatter,
        description="Creates ground-truth aligned (GTA) spectrograms from the vocoder.",
    )
    cli.add_argument(
        "datasets_root",
        type=str,
        help=("Path to the directory containing your SV2TTS directory. If you specify both --in_dir and "
              "--out_dir, this argument won't be used."),
    )
    cli.add_argument(
        "--model_dir",
        type=str,
        default="synthesizer/saved_models/logs-pretrained/",
        help="Path to the pretrained model directory.",
    )
    # SUPPRESS keeps the attribute off the namespace when the option is
    # omitted, which is how the defaults below detect "not given".
    cli.add_argument(
        "-i", "--in_dir",
        type=str,
        default=argparse.SUPPRESS,
        help=("Path to the synthesizer directory that contains the mel spectrograms, the wavs and the "
              "embeds. Defaults to  <datasets_root>/SV2TTS/synthesizer/."),
    )
    cli.add_argument(
        "-o", "--out_dir",
        type=str,
        default=argparse.SUPPRESS,
        help=("Path to the output vocoder directory that will contain the ground truth aligned mel "
              "spectrograms. Defaults to <datasets_root>/SV2TTS/vocoder/."),
    )
    cli.add_argument(
        "--hparams",
        default="",
        help=("Hyperparameter overrides as a comma-separated list of name=value "
              "pairs"),
    )
    cli.add_argument(
        "-gpuid", "--gpu_id",
        type=str,
        default='0',
        help="Select the GPU to run the code",
    )

    options = cli.parse_args()
    print_args(options, cli)

    os.environ['CUDA_VISIBLE_DEVICES'] = options.gpu_id
    overridden_hparams = hparams.parse(options.hparams)

    # Fall back to the conventional SV2TTS layout for missing directories.
    for attr_name, subdir in (("in_dir", "synthesizer"), ("out_dir", "vocoder")):
        if not hasattr(options, attr_name):
            setattr(options, attr_name,
                    os.path.join(options.datasets_root, "SV2TTS", subdir))

    run_synthesis(options.in_dir, options.out_dir, options.model_dir,
                  overridden_hparams)
Esempio n. 7
0
def parse_args():
    """Parse the training command line and return the resulting namespace.

    The parsed arguments are echoed through ``print_args`` before being
    returned.

    Returns:
        The ``argparse.Namespace`` produced by the parser.
    """
    cli = argparse.ArgumentParser()

    # Run identity.
    cli.add_argument(
        "run_id",
        type=str,
        help=("Name for this model instance. If a model state from the same run ID was previously "
              "saved, the training will restart from there. Pass -f to overwrite saved states and "
              "restart from scratch."),
    )

    # Data and output locations.
    cli.add_argument(
        "-d", "--data_dir",
        type=Path,
        default="/datadrive/google-landmark/train_clean_processed_symlink",
        help="Path to preprocessed data",
    )
    cli.add_argument(
        "-vd", "--validate_data_dir",
        type=Path,
        default="/datadrive/google-landmark/train_clean_processed_symlink",
        help="Path to preprocessed data",
    )
    cli.add_argument(
        "-m", "--models_dir",
        type=Path,
        default="/datadrive/google-landmark/landmark-retrieval/GE2E/train_ckpts",
        help=("Path to the output directory that will contain the saved model weights, as well as "
              "backups of those weights and plots generated during training."),
    )

    # Step intervals.
    cli.add_argument(
        "-v", "--vis_every",
        type=int,
        default=250,
        help="Number of steps between updates of the loss and the plots.",
    )
    cli.add_argument(
        "-u", "--umap_every",
        type=int,
        default=250,
        help=("Number of steps between updates of the umap projection. Set to 0 to never update the "
              "projections."),
    )
    cli.add_argument(
        "-s", "--save_every",
        type=int,
        default=250,
        help=("Number of steps between updates of the model on the disk. Set to 0 to never save the "
              "model."),
    )
    cli.add_argument(
        "-b", "--backup_every",
        type=int,
        default=250,
        help=("Number of steps between backups of the model. Set to 0 to never make backups of the "
              "model."),
    )
    cli.add_argument(
        "-ve", "--validate_every",
        type=int,
        default=250,
        help="Number of steps between validation step.",
    )

    # Restart behaviour.
    cli.add_argument(
        "-f", "--force_restart",
        action="store_true",
        help="Do not load any saved model.",
    )

    # Visdom settings.
    cli.add_argument("--visdom_server", type=str, default="http://localhost")
    cli.add_argument("--port", type=str, default="8870")
    cli.add_argument("--no_visdom", action="store_true", help="Disable visdom.")

    parsed = cli.parse_args()
    print_args(parsed, cli)
    return parsed
Esempio n. 8
0
def main():
    """Parse the synthesizer-training command line and start training.

    After parsing, the arguments are echoed through ``print_args``,
    ``prepare_run(args)`` supplies the log directory and hyperparameters, and
    ``tacotron_train`` is called with the arguments and both of those values.
    """

    def str2bool(value):
        """Convert a command-line token into a bool.

        ``type=bool`` cannot be used for this: argparse would call
        ``bool("False")``, which is ``True`` because the string is non-empty.
        The empty string maps to ``False``, as it did with ``type=bool``.
        """
        token = value.strip().lower()
        if token in ("true", "t", "yes", "y", "1"):
            return True
        if token in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value (true/false), got %r" % value)

    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="Name of the run and of the logging directory.")
    parser.add_argument("synthesizer_root", type=str, help=
        "Path to the synthesizer training data that contains the audios and the train.txt file. "
        "If you let everything as default, it should be <datasets_root>/SV2TTS/synthesizer/.")
    parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=
        "Path to the output directory that will contain the saved model weights and the logs.")
    parser.add_argument("--mode", default="synthesis",
                        help="mode for synthesis of tacotron after training")
    # NOTE(review): --GTA is kept as the *string* "True"; whether consumers
    # compare against the string or expect a bool is not visible here.
    parser.add_argument("--GTA", default="True",
                        help="Ground truth aligned synthesis, defaults to True, only considered "
                             "in Tacotron synthesis mode")
    # Parsed with str2bool so that "--restore False" really yields False.
    parser.add_argument("--restore", type=str2bool, default=True,
                        help="Set this to False to do a fresh training")
    parser.add_argument("--summary_interval", type=int, default=2500,
                        help="Steps between running summary ops")
    parser.add_argument("--embedding_interval", type=int, default=10000,
                        help="Steps between updating embeddings projection visualization")
    parser.add_argument("--checkpoint_interval", type=int, default=2000,  # Was 5000
                        help="Steps between writing checkpoints")
    parser.add_argument("--eval_interval", type=int, default=100000,  # Was 10000
                        help="Steps between eval on test data")
    parser.add_argument("--tacotron_train_steps", type=int, default=2000000,  # Was 100000
                        help="total number of tacotron training steps")
    parser.add_argument("--tf_log_level", type=int, default=1, help="Tensorflow C++ log level.")
    parser.add_argument("--slack_url", default=None,
                        help="slack webhook notification destination link")
    parser.add_argument("--hparams", default="",
                        help="Hyperparameter overrides as a comma-separated list of name=value "
                             "pairs")
    args = parser.parse_args()
    print_args(args, parser)

    log_dir, hparams = prepare_run(args)

    tacotron_train(args, log_dir, hparams)
Esempio n. 9
0
    # Verify webrtcvad is available
    if not args.no_trim:
        try:
            import webrtcvad
        except:
            raise ModuleNotFoundError(
                "Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and try again. If installation fails, "
                "use --no_trim to disable this error message.")
    del args.no_trim

    # Process the arguments
    args.datasets = args.datasets.split(",")
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "encoder")
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Preprocess the datasets
    print_args(args, parser)
    preprocess_func = {
        "librispeech_other": preprocess_librispeech,
        "voxceleb1": preprocess_voxceleb1,
        "voxceleb2": preprocess_voxceleb2,
    }
    args = vars(args)
    for dataset in args.pop("datasets"):
        print("Preprocessing %s" % dataset)
        preprocess_func[dataset](**args)
Esempio n. 10
0
def set_args(filename):
    """Parse the training options and save them to ``filename`` as JSON.

    Args:
        filename: Path of the file that receives the parsed arguments,
            written with ``json.dump`` (2-space indent). The file is
            overwritten.

    Returns:
        The parsed ``argparse.Namespace``.
    """

    def parse_rel2label(text):
        """Parse ``--rel2label`` from a JSON object given on the command line.

        ``type=dict`` cannot be used for this: argparse would call
        ``dict("<string>")``, which fails for every possible input.
        """
        mapping = json.loads(text)
        if not isinstance(mapping, dict):
            raise argparse.ArgumentTypeError(
                "expected a JSON object mapping relation names to label ids")
        return mapping

    parser = argparse.ArgumentParser()
    # Tunable parameters
    parser.add_argument(
        "--train_epochs",
        default=20,  # default was 5
        type=int,
        help="训练次数大小")
    parser.add_argument("--embeddings_lr",
                        default=5e-4,
                        type=float,
                        help="Embeddings初始学习步长")
    parser.add_argument("--encoder_lr", default=5e-4, type=float)
    parser.add_argument("--learning_rate", default=5e-4, type=float)
    parser.add_argument("--weight_decay", default=0, type=float)
    parser.add_argument(
        "--train_batch_size",
        default=16,  # default was 8
        type=int,
        help="训练时batch大小")
    parser.add_argument(
        "--max_sent_len",
        default=128,  # default was 256
        type=int,
        help="文本最大长度")
    parser.add_argument("--test_size", default=.0, type=float, help="验证集大小")
    # NOTE(review): the training file defaults to the same test.csv as
    # --test_data_filename -- confirm this is intended.
    parser.add_argument(
        "--train_data_filename",
        default='data/rel_data/test.csv',
        type=str,
        help=
        "The input data filename. Should contain the .csv files (or other data files) for the "
        "task.")
    parser.add_argument("--test_data_filename",
                        default='data/rel_data/test.csv',
                        type=str)
    parser.add_argument("--train_data_dir",
                        default='data/rel_data/',
                        type=str,
                        help="The input data dir.")
    parser.add_argument("--test_data_dir", default='data/rel_data/', type=str)
    parser.add_argument("--mymodel_config_dir",
                        default='config/relation_classify_config.json',
                        type=str)
    parser.add_argument("--mymodel_save_dir",
                        default='checkpoint/relation_classify/',
                        type=str)
    parser.add_argument("--pretrained_model_dir",
                        default='pretrained_model/pytorch_electra_180g_large/',
                        type=str)
    parser.add_argument(
        "--vocab_dir",
        default='pretrained_model/pytorch_electra_180g_large/vocab.txt',
        type=str,
        help="The vocab data dir.")
    # The default is already a dict, so the converter only runs for values
    # actually given on the command line.
    parser.add_argument("--rel2label",
                        default={
                            'Causal': 0,
                            'Follow': 1,
                            'Accompany': 2,
                            'Concurrency': 3,
                            'Other': 4
                        },
                        type=parse_rel2label)
    # NOTE(review): store_true combined with default=True means --do_train
    # and --do_eval are always True; they cannot be switched off from the
    # command line.
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="训练模式")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="验证模式")
    parser.add_argument("--no_gpu",
                        default=False,
                        action='store_true',
                        help="用不用gpu")
    parser.add_argument("--seed", default=6, type=int, help="初始化时的随机数种子")
    parser.add_argument(
        "--gradient_accumulation_steps",
        default=1,
        type=int,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--optimize_on_cpu",
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU."
    )
    parser.add_argument(
        "--fp16",
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit.")
    parser.add_argument(
        "--loss_scale",
        default=128,
        type=float,
        help=
        "Loss scaling, positive power of 2 values can improve fp16 convergence."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", default=False, action='store_true')
    args = parser.parse_args()
    print_args(args, parser)
    # Persist the effective configuration next to the run.
    with open(filename, 'w') as f:
        json.dump(vars(args), f, indent=2)
    return args
def hello_world():
    """Render the demo page and, when an upload is available, synthesize speech.

    The text to speak comes from the ``textarea`` form field on POST requests
    and falls back to a fixed demo sentence otherwise. When ``upload_file()``
    yields nothing (its string form is ``"None"``) the page is rendered with
    an empty output. Otherwise ``legoutput[1]`` is used as the path of the
    reference audio: the encoder embeds it, the synthesizer produces a mel
    spectrogram for the text, the vocoder turns that into a waveform, and the
    result is written to ``static/output.wav``.

    Returns:
        The rendered ``index.html`` template. Its ``output`` value is empty,
        the result of ``htmloader(...)``, or a ``"Caught exception: ..."``
        message when any step of the pipeline raises.
    """
    legoutput = upload_file()
    lig = "This is a demo utterance. This will work when you do not add any utterance."
    if request.method == 'POST':
        lig = request.form["textarea"]
    print(str(lig))

    # Nothing uploaded: render the page without running the models.
    if str(legoutput) == "None":
        return render_template("index.html", output="")

    from encoder.params_model import model_embedding_size as speaker_embedding_size
    from utils.argutils import print_args
    from synthesizer.inference import Synthesizer
    from encoder import inference as encoder
    from vocoder import inference as vocoder
    from pathlib import Path
    import numpy as np
    import librosa
    import argparse
    import torch
    try:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument("-e",
                            "--enc_model_fpath",
                            type=Path,
                            default="encoder/saved_models/pretrained.pt")
        parser.add_argument(
            "-s",
            "--syn_model_dir",
            type=Path,
            default="synthesizer/saved_models/logs-pretrained/")
        parser.add_argument(
            "-v",
            "--voc_model_fpath",
            type=Path,
            default="vocoder/saved_models/pretrained/pretrained.pt")
        parser.add_argument("--low_mem", action="store_true")
        # This runs inside a request handler, so sys.argv belongs to the web
        # server process. parse_args() would raise SystemExit (not caught by
        # ``except Exception``) on arguments this parser does not know;
        # parse_known_args() ignores them and yields the same namespace
        # whenever parse_args() would have succeeded.
        args, _ = parser.parse_known_args()
        print_args(args, parser)

        # NOTE(review): the three models are loaded on every request;
        # consider loading them once at start-up.
        encoder.load_model(args.enc_model_fpath)
        synthesizer = Synthesizer(
            args.syn_model_dir.joinpath("taco_pretrained"),
            low_mem=args.low_mem)
        vocoder.load_model(args.voc_model_fpath)

        in_fpath = legoutput[1]
        print(str(in_fpath))
        # Load the reference audio once and preprocess the loaded samples.
        # The earlier extra preprocess_wav(in_fpath) call was redundant: its
        # result was overwritten straight away.
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                  sampling_rate)
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        text = str(lig)
        texts = [text]
        embeds = [embed]
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        spec = specs[0]
        print("Created the mel spectrogram")

        print("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec)
        # Append synthesizer.sample_rate zero samples to the end of the
        # waveform.
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                               mode="constant")

        fpath = "static/output.wav"
        print(generated_wav.dtype)
        # NOTE(review): librosa.output.write_wav is not available in
        # librosa >= 0.8 -- confirm the pinned librosa version.
        librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                 synthesizer.sample_rate)
        print("\nSaved output as %s\n\n" % fpath)
        return render_template("index.html",
                               output=htmloader(text, legoutput[1], fpath))
    except Exception as e:
        # Any failure in the pipeline is shown on the page instead of
        # producing an error response.
        return render_template("index.html",
                               output="Caught exception: %s" % repr(e))