Example #1
    def _load_vocoder(self, model_file: str, model_config: str,
                      use_cuda: bool) -> None:
        """Load the vocoder model.

        1. Load the vocoder config.
        2. Init the AudioProcessor for the vocoder.
        3. Init the vocoder model from the config.
        4. Move the model to the GPU if CUDA is enabled.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()
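
For context, a minimal usage sketch: `_load_vocoder` is an internal method, normally reached through the `Synthesizer` constructor. The constructor signature and file paths below are assumptions for illustration, not taken from the excerpt.

# Hedged sketch; check TTS.utils.synthesizer for the exact signature.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="tts_model.pth.tar",          # hypothetical paths
    tts_config_path="tts_config.json",
    vocoder_checkpoint="vocoder_model.pth.tar",  # these two trigger _load_vocoder
    vocoder_config="vocoder_config.json",
    use_cuda=False,
)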
Example #2
    def test_speaker_embedding():
        # load config
        config = load_config(encoder_config_path)
        config.audio.resample = True

        # create a dummy speaker encoder
        model = setup_model(config)
        save_checkpoint(model, None, None, get_tests_input_path(), 0)

        # load audio processor and speaker encoder
        ap = AudioProcessor(**config.audio)
        manager = SpeakerManager(encoder_model_path=encoder_model_path,
                                 encoder_config_path=encoder_config_path)

        # load a sample audio and compute embedding
        waveform = ap.load_wav(sample_wav_path)
        mel = ap.melspectrogram(waveform)
        d_vector = manager.compute_d_vector(mel.T)
        assert d_vector.shape[1] == 256

        # compute d_vector directly from an input file
        d_vector = manager.compute_d_vector_from_clip(sample_wav_path)
        d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path)
        d_vector = torch.FloatTensor(d_vector)
        d_vector2 = torch.FloatTensor(d_vector2)
        assert d_vector.shape[0] == 256
        assert (d_vector - d_vector2).sum() == 0.0

        # compute d_vector from a list of wav files.
        d_vector3 = manager.compute_d_vector_from_clip(
            [sample_wav_path, sample_wav_path2])
        d_vector3 = torch.FloatTensor(d_vector3)
        assert d_vector3.shape[0] == 256
        assert (d_vector - d_vector3).sum() != 0.0

        # remove dummy model
        os.remove(encoder_model_path)
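
Since d-vectors are fixed-size embeddings, a natural follow-up is comparing them by cosine similarity rather than exact equality. A small sketch in plain PyTorch, with no TTS-specific API assumed:

def d_vector_similarity(d1, d2):
    # cosine similarity between two 256-dim speaker embeddings
    d1 = torch.FloatTensor(d1)
    d2 = torch.FloatTensor(d2)
    return torch.nn.functional.cosine_similarity(d1, d2, dim=0).item()

The same clip compared against itself should score 1.0, matching the exact-equality assertion above; different clips of the same speaker should still score high.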
Example #3
def main():
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path",
                        type=str,
                        help="Path to dataset config file.",
                        required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_meta_data(c.datasets, eval_split=True)
    items = train_items + eval_items

    texts = "".join(item[0] for item in items)
    chars = set(texts)
    lower_chars = filter(lambda c: c.islower(), chars)
    chars_force_lower = [c.lower() for c in chars]
    chars_force_lower = set(chars_force_lower)

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(
        f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}"
    )
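
The `item[0]` indexing above assumes each metadata item stores the transcript first. A toy run of the same counting logic, independent of any dataset:

items = [("Hello world.", "wavs/a.wav"), ("Zürich", "wavs/b.wav")]
texts = "".join(item[0] for item in items)
chars = set(texts)
print(f" > Number of unique characters: {len(chars)}")
print(f" > Unique characters: {''.join(sorted(chars))}")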
Example #4
    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Load the TTS model.

        1. Load the model config.
        2. Init the AudioProcessor.
        3. Init the model from the config.
        4. Move the model to the GPU if CUDA is enabled.
        5. Init the speaker manager for the model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement
        self.tts_config = load_config(tts_config_path)
        if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None:
            raise ValueError("Phonemizer is not defined in the TTS config.")

        self.tts_model = setup_tts_model(config=self.tts_config)

        if not self.encoder_checkpoint:
            self._set_speaker_encoder_paths_from_tts_config()

        self.tts_model.load_checkpoint(self.tts_config,
                                       tts_checkpoint,
                                       eval=True)
        if use_cuda:
            self.tts_model.cuda()

        if self.encoder_checkpoint and hasattr(self.tts_model,
                                               "speaker_manager"):
            self.tts_model.speaker_manager.init_encoder(
                self.encoder_checkpoint, self.encoder_config)
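
The phonemizer guard is the piece most likely to bite when porting older configs. Extracted as a standalone check (a sketch; the config path is hypothetical, the keys are as used in the excerpt):

config = load_config("tts_config.json")  # hypothetical path
if config["use_phonemes"] and config["phonemizer"] is None:
    raise ValueError("Phonemizer is not defined in the TTS config.")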
Example #5
def main():
    # pylint: disable=W0601
    global c
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(c.datasets, eval_split=True)
    items = train_items + eval_items
    print("Num items:", len(items))

    phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
    phones = []
    for ph in phonemes:
        phones.extend(ph)
    phones = set(phones)
    lower_phones = filter(lambda c: c.islower(), phones)
    phones_force_lower = [c.lower() for c in phones]
    phones_force_lower = set(phones_force_lower)

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
Example #6
    default="",
    help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.",
)
parser.add_argument("--use_cuda",
                    type=bool,
                    help="flag to set cuda.",
                    default=True)
parser.add_argument(
    "--separator",
    type=str,
    help="Separator used in file if CSV is passed for data_path",
    default="|")
args = parser.parse_args()

c = load_config(args.config_path)
ap = AudioProcessor(**c["audio"])

data_path = args.data_path
split_ext = os.path.splitext(data_path)
sep = args.separator

if args.target_dataset != "":
    # if target dataset is defined
    dataset_config = [
        BaseDatasetConfig(name=args.target_dataset,
                          path=args.data_path,
                          meta_file_train=None,
                          meta_file_val=None),
    ]
    wav_files, _ = load_meta_data(dataset_config, eval_split=False)
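
The excerpt parses `split_ext` and `sep` but cuts off before using them. A hedged sketch of the missing else-branch, assuming a CSV metadata file lists wav paths in its first column (the upstream script may differ):

import glob  # assumed import

if args.target_dataset == "":
    if split_ext[1].lower() == ".csv":
        with open(data_path, encoding="utf-8") as f:
            wav_files = [line.split(sep)[0].strip() for line in f]
    else:
        wav_files = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True)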
Example #7
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args; the `else` must attach to the outer `if`,
        # otherwise `config` is never defined when neither path is given
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets,
                                                   eval_split=True)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss",
                                       True):
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.
                speaker_encoder_model_path,
                encoder_config_path=config.model_args.
                speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(
                    config, "d_vector_file"))
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
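
These `bin` scripts are executed directly rather than imported; the standard entry-point guard (assumed here, since the excerpt ends at `trainer.fit()`) would be:

if __name__ == "__main__":
    main()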
Example #8
    "--old_file",
    type=str,
    help="Previous speakers.json file, only compute for new audios.",
    default=None)
parser.add_argument("--use_cuda",
                    type=bool,
                    help="flag to set cuda. Default False",
                    default=False)
parser.add_argument("--no_eval",
                    type=bool,
                    help="Do not compute eval?. Default False",
                    default=False)

args = parser.parse_args()

c_dataset = load_config(args.config_dataset_path)

meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets,
                                                   eval_split=not args.no_eval)

if meta_data_eval is None:
    wav_files = meta_data_train
else:
    wav_files = meta_data_train + meta_data_eval

encoder_manager = SpeakerManager(
    encoder_model_path=args.model_path,
    encoder_config_path=args.config_path,
    d_vectors_file_path=args.old_file,
    use_cuda=args.use_cuda,
)
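
A hedged sketch of the continuation this excerpt builds toward: compute one d-vector per wav and collect them by file name. The item layout, the mapping structure, and the `tqdm` import are assumptions, not part of the excerpt.

from tqdm import tqdm  # assumed import

speaker_mapping = {}
for item in tqdm(wav_files):
    wav_file = item[1] if isinstance(item, (list, tuple)) else item
    embedding = encoder_manager.compute_d_vector_from_clip(wav_file)
    speaker_mapping[os.path.basename(wav_file)] = {"embedding": embedding}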
Example #9
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item if isinstance(item, str) else item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg lienar spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")
Example #10
    def download_model(self, model_name):
        """Download model files given the full model name.
        Model name is in the format
            'type/language/dataset/model'
            e.g. 'tts_model/en/ljspeech/tacotron'

        Every model must have the following files
            - *.pth.tar : pytorch model checkpoint file.
            - config.json : model config file.
            - scale_stats.npy (if it exists): scale values for preprocessing.

        Args:
            model_name (str): model name as explained above.

        TODO: support multi-speaker models
        """
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
        model_item = self.models_dict[model_type][lang][dataset][model]
        # set the model specific output path
        output_path = os.path.join(self.output_prefix, model_full_name)
        output_model_path = os.path.join(output_path, "model_file.pth.tar")
        output_config_path = os.path.join(output_path, "config.json")
        # NOTE : band-aid for removing phoneme support
        if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
            raise RuntimeError(
                " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
            )
        if os.path.exists(output_path):
            print(f" > {model_name} is already downloaded.")
        else:
            os.makedirs(output_path, exist_ok=True)
            print(f" > Downloading model to {output_path}")
            output_stats_path = os.path.join(output_path, "scale_stats.npy")
            output_speakers_path = os.path.join(output_path, "speakers.json")
            # download files to the output path
            if self._check_dict_key(model_item, "github_rls_url"):
                # download from github release
                # TODO: pass output_path
                self._download_zip_file(model_item["github_rls_url"],
                                        output_path)
            else:
                # download from gdrive
                self._download_gdrive_file(model_item["model_file"],
                                           output_model_path)
                self._download_gdrive_file(model_item["config_file"],
                                           output_config_path)
                if self._check_dict_key(model_item, "stats_file"):
                    self._download_gdrive_file(model_item["stats_file"],
                                               output_stats_path)

            # update the scale_stats.npy file path in the model config.json
            if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
                # set scale stats path in config.json
                config_path = output_config_path
                config = load_config(config_path)
                config.audio.stats_path = output_stats_path
                config.save_json(config_path)
            # update the speakers.json file path in the model config.json to the current path
            if os.path.exists(output_speakers_path):
                # set the speakers file path in config.json
                config_path = output_config_path
                config = load_config(config_path)
                config.external_speaker_embedding_file = output_speakers_path
                config.save_json(config_path)
        return output_model_path, output_config_path, model_item
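
A usage sketch following the name format given in the docstring; `ModelManager` construction details are an assumption:

from TTS.utils.manage import ModelManager

manager = ModelManager()
model_path, config_path, model_item = manager.download_model("tts_model/en/ljspeech/tacotron")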
Example #11
File: training.py  Project: coqui-ai/TTS
def process_args(args, config=None):
    """Process parsed comand line arguments and initialize the config if not provided.
    Args:
        args (argparse.Namespace or dict like): Parsed input arguments.
        config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
    Returns:
        config (Coqpit): Config parameters.
        experiment_path (str): Path to save models and logging.
        audio_path (str): Path to save generated test audios.
        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
            logging to the console.
        dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard logging.
    TODO:
        - Interactive config definition.
    """
    if isinstance(args, tuple):
        args, coqpit_overrides = args
    if args.continue_path:
        # continue a previous training from its output folder
        experiment_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model
    # init config if not already defined
    if config is None:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
        else:
            # init from console args
            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

            config_base = BaseTrainingConfig()
            config_base.parse_known_args(coqpit_overrides)
            config = register_config(config_base.model)()
    # override values from command-line args
    config.parse_known_args(coqpit_overrides, relaxed_parser=True)
    experiment_path = args.continue_path
    if not experiment_path:
        experiment_path = get_experiment_folder_path(config.output_path,
                                                     config.run_name)
    audio_path = os.path.join(experiment_path, "test_audios")
    config.output_log_path = experiment_path
    # setup rank 0 process in distributed training
    dashboard_logger = None
    if args.rank == 0:
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        # if model characters are not set in the config file
        # save the default set to the config file for future
        # compatibility.
        if config.has("characters") and config.characters is None:
            used_characters = parse_symbols()
            new_fields["characters"] = used_characters
        copy_model_files(config, experiment_path, new_fields)
        dashboard_logger = logger_factory(config, experiment_path)
    c_logger = ConsoleLogger()
    return config, experiment_path, audio_path, c_logger, dashboard_logger
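
A hedged call-site sketch: `args` may arrive as a `(namespace, overrides)` tuple from `parse_known_args`, which the tuple check at the top of the function unpacks. The parser name is illustrative.

args = parser.parse_known_args()  # (Namespace, leftover overrides)
config, experiment_path, audio_path, c_logger, dashboard_logger = process_args(args)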