Beispiel #1
0
def do_phonemize(args):
    """Print the phonemes for each non-empty line of input text.

    Lines are taken from ``args.text`` when it is set, otherwise from
    standard input. Every line is converted to phoneme indexes using the
    settings of the config loaded from ``args.config``; the indexes are
    mapped back to phoneme symbols and printed joined by ``args.separator``.
    """
    from TTS.utils.io import load_config
    from TTS.tts.utils.text import make_symbols, phoneme_to_sequence

    config = load_config(args.config)
    _symbols, phoneme_table = make_symbols(**config.characters)

    source_lines = args.text
    if not source_lines:
        # Nothing was given as an argument: read from stdin instead
        source_lines = sys.stdin

        if os.isatty(sys.stdin.fileno()):
            print("Reading text from stdin...", file=sys.stderr)

    # Strip lazily so stdin is still consumed one line at a time
    stripped_lines = (raw_line.strip() for raw_line in source_lines)

    for text in filter(None, stripped_lines):
        indexes = phoneme_to_sequence(
            text,
            [config.text_cleaner],
            language=config.phoneme_language,
            enable_eos_bos=False,
            tp=config.characters if "characters" in config.keys() else None,
            backend=config.phoneme_backend,
        )

        print(args.separator.join(phoneme_table[i] for i in indexes))
Beispiel #2
0
    def get_characters(config: Coqpit) -> tuple:
        """Resolve the character set the model works with.

        Args:
            config (Coqpit): model config. ``characters`` and ``use_phonemes``
                are read, and ``add_blank`` when present. If ``characters``
                is ``None`` it is replaced in place with a
                ``CharactersConfig`` built from ``parse_symbols()``.

        Returns:
            tuple: ``(model_characters, config, num_chars)`` where
                ``model_characters`` is the phoneme set if
                ``config.use_phonemes`` is truthy and the symbol set
                otherwise, and ``num_chars`` is its length plus
                ``add_blank`` (counted as 0 when the attribute is missing).
        """
        # TODO: implement CharacterProcessor
        if config.characters is not None:
            symbols, phonemes = make_symbols(**config.characters)
        else:
            from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols

            config.characters = CharactersConfig(**parse_symbols())
        model_characters = phonemes if config.use_phonemes else symbols
        # add_blank is added as a number, so a True flag contributes one slot
        num_chars = len(model_characters) + getattr(config, "add_blank", False)
        return model_characters, config, num_chars
Beispiel #3
0
    def get_characters(config: Coqpit) -> tuple:
        """Resolve the character set the model works with.

        Args:
            config (Coqpit): model config. ``characters`` and ``use_phonemes``
                are read. If ``characters`` is ``None`` it is replaced in
                place with the result of ``parse_symbols()``.

        Returns:
            tuple: ``(model_characters, config)`` where ``model_characters``
                is the phoneme set if ``config.use_phonemes`` is truthy and
                the symbol set otherwise.
        """
        # TODO: implement CharacterProcessor
        if config.characters is not None:
            symbols, phonemes = make_symbols(**config.characters)
        else:
            from TTS.tts.utils.text.symbols import (  # pylint: disable=import-outside-toplevel
                parse_symbols,
                phonemes,
                symbols,
            )

            config.characters = parse_symbols()
        model_characters = phonemes if config.use_phonemes else symbols
        return model_characters, config
Beispiel #4
0
    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Build the TTS model from its config and restore its checkpoint.

        Sets ``tts_config``, ``use_phonemes``, ``ap``, ``input_size`` and
        ``tts_model`` on the instance. When the config has a ``characters``
        entry, the module-level ``symbols`` and ``phonemes`` are rebound to
        the sets built from it.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement
        global symbols, phonemes

        self.tts_config = load_config(tts_config_path)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        # A character set in the config replaces the module-level one
        if "characters" in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        self.input_size = len(phonemes if self.use_phonemes else symbols)

        if self.tts_config.use_speaker_embedding is True:
            # Fall back to the config's embedding file when none was set
            if not self.tts_speakers_file:
                self.tts_speakers_file = self.tts_config[
                    "external_speaker_embedding_file"]
            self._load_speakers(self.tts_speakers_file)

        self.tts_model = setup_model(
            self.input_size,
            num_speakers=self.num_speakers,
            c=self.tts_config,
            speaker_embedding_dim=self.speaker_embedding_dim,
        )
        self.tts_model.load_checkpoint(
            self.tts_config, tts_checkpoint, eval=True)

        if use_cuda:
            self.tts_model.cuda()
Beispiel #5
0
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        """Build the TTS model from its config and restore its checkpoint.

        Sets ``tts_config``, ``use_phonemes``, ``ap``, ``input_size`` and
        ``tts_model`` on the instance. When the config has a ``characters``
        entry, the module-level ``symbols`` and ``phonemes`` are rebound to
        the sets built from it.

        Args:
            tts_checkpoint: checkpoint handed to the model's
                ``load_checkpoint``.
            tts_config: value handed to ``load_config`` to obtain the model
                config.
            use_cuda: when truthy, the model is moved to CUDA after loading.
        """
        # pylint: disable=global-statement
        global symbols, phonemes

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        self.tts_model = setup_model(self.input_size, num_speakers=self.num_speakers, c=self.tts_config)
        # Hand over the loaded config object, not the raw ``tts_config``
        # argument that was only the input to load_config().
        self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()
Beispiel #6
0
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        """Build the TTS model, load its weights and switch it to eval mode.

        Sets ``tts_config``, ``use_phonemes``, ``ap``, ``input_size`` and
        ``tts_model`` on the instance, plus ``tts_speakers`` when
        ``self.config.tts_speakers`` is not ``None``. When the config has a
        ``characters`` entry, the module-level ``symbols`` and ``phonemes``
        are rebound to the sets built from it.

        Args:
            tts_checkpoint: checkpoint file read with ``torch.load``.
            tts_config: value handed to ``load_config`` to obtain the model
                config.
            use_cuda: when truthy, the model is moved to CUDA.
        """
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        # A character set in the config replaces the module-level one
        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        self.input_size = len(phonemes if self.use_phonemes else symbols)

        # TODO: fix this for multi-speaker model - load speakers
        speaker_count = 0
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            speaker_count = len(self.tts_speakers)

        self.tts_model = setup_model(
            self.input_size, num_speakers=speaker_count, c=self.tts_config)

        # Restore the weights on CPU first; the move to GPU happens below
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        self.tts_model.load_state_dict(cp['model'])

        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()

        decoder = self.tts_model.decoder
        decoder.max_decoder_steps = 3000
        if 'r' in cp:
            decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")
Beispiel #7
0
def do_verify_phonemes(args):
    """Verify that phoneme cache matches what gruut would produce.

    Loads the gruut language ``args.language`` and the config at
    ``args.config``, then walks every ``*.npy`` file in the config's phoneme
    cache directory. The cached index sequence is mapped to phoneme symbols
    and compared with the phonemes gruut produces for the item's text, taken
    from the training metadata files of the configured datasets.

    Args:
        args: parsed arguments; ``language`` and ``config`` are read.

    Raises:
        AssertionError: if the language could not be loaded, or if a cached
            sequence differs from the recomputed one.
        KeyError: if a cached item id does not appear in the metadata.
    """
    import numpy as np
    from TTS.utils.io import load_config
    from TTS.tts.utils.text import make_symbols

    _LOGGER.debug("Loading gruut language %s", args.language)
    gruut_lang = gruut.Language.load(
        Path("~/.config/rhasspy/profiles/de/tts/larynx/de/thorsten/gruut/de-de"
             ).expanduser(),
        args.language,
    )
    # Name the requested language: gruut_lang itself is falsy on this path
    assert gruut_lang, f"Unsupported language: {args.language}"

    # Load config
    c = load_config(args.config)
    output_path = Path(c.output_path)
    phoneme_cache_dir = Path(c.phoneme_cache_path)
    _, phonemes = make_symbols(**c.characters)

    # Include or exclude word break symbol (#)
    word_breaks = c.get("characters", {}).get("word_breaks", True)

    # Load lexicon and missing words
    lexicon = gruut_lang.phonemizer.lexicon

    missing_words_path = output_path / "missing_words.txt"
    if missing_words_path.is_file():
        _LOGGER.debug("Loading missing words from %s", missing_words_path)
        with open(missing_words_path, "r",
                  encoding="utf-8") as missing_words_file:
            gruut.utils.load_lexicon(missing_words_file, lexicon=lexicon)

    # Load metadata: each non-empty line is "<item id>|<text>"
    id_to_text = {}
    for ds in c.datasets:
        metadata_path = Path(ds["path"]) / ds["meta_file_train"]
        with open(metadata_path, "r", encoding="utf-8") as metadata_file:
            for line in metadata_file:
                line = line.strip()
                if line:
                    item_id, item_text = line.split("|", maxsplit=1)
                    id_to_text[item_id] = item_text

    id_to_phonemes = {}
    for phoneme_path in phoneme_cache_dir.glob("*.npy"):
        item_id = re.sub("_phoneme$", "", phoneme_path.stem)
        _LOGGER.debug("Processing %s (id=%s)", phoneme_path, item_id)

        # NOTE(review): allow_pickle=True can run arbitrary code from a
        # crafted .npy file; only point this at a cache you generated.
        sequence = np.load(phoneme_path, allow_pickle=True)
        actual_phonemes = [phonemes[index] for index in sequence]

        # Look the text up per item so the failure message below can never
        # show text left over from a different item.
        item_text = id_to_text[item_id]

        expected_phonemes = id_to_phonemes.get(item_id)
        if not expected_phonemes:
            # Compute expected phonemes
            expected_phonemes = []

            for sentence in gruut_lang.tokenizer.tokenize(item_text):
                # Choose first pronunciation for each word
                word_phonemes = [
                    wp[0] for wp in gruut_lang.phonemizer.phonemize(
                        sentence.clean_words,
                        word_indexes=True,
                        word_breaks=word_breaks,
                        separate_tones=None,
                    ) if wp
                ]

                # Accumulate inside the loop so every sentence contributes,
                # not only the last one.
                expected_phonemes.extend(
                    p for ps in word_phonemes for p in ps)

            # Associate with item id
            id_to_phonemes[item_id] = expected_phonemes

        assert (
            actual_phonemes == expected_phonemes
        ), f"Got {actual_phonemes}, expected {expected_phonemes} for '{item_text}'"

        print(item_id, "OK")
    def __init__(self, text, expected_output_audio_format, file_name):
        """Synthesize ``text`` and store it in the requested audio format.

        Loads the TTS and vocoder models from the hard-coded paths below,
        runs ``self.tts`` on ``text``, writes the result to
        ``file_name + "_audio.wav"`` and finally calls
        ``self.convert_audio_to``. When the TTS config has a ``characters``
        entry, the module-level ``symbols`` and ``phonemes`` are rebound to
        the sets built from it.

        Args:
            text: input handed to ``self.tts``.
            expected_output_audio_format: handed to ``self.convert_audio_to``.
            file_name: prefix of the written ``.wav`` file; also handed to
                ``self.convert_audio_to``.
        """
        # the imported symbols and phonemes may be rebound below
        global symbols, phonemes

        # pysbd segmenter, kept on the instance for splitting input later
        self.seg = pysbd.Segmenter(language="en", clean=True)

        # runtime settings
        use_cuda = False

        # model paths - models and config files are taken from Mozilla TTS's github page
        tts_checkpoint_path = "/path/to/checkpoint_130000.pth.tar"
        tts_config_path = "server/config/config.json"
        vocoder_checkpoint_path = "/path/to/checkpoint_1450000.pth.tar"
        vocoder_config_path = "server/config/config_vocoder.json"

        # load configs; the TTS one is kept on the instance for convert_audio_to()
        tts_config = load_config(tts_config_path)
        self.TTS_CONFIG = tts_config
        vocoder_config = load_config(vocoder_config_path)

        # load the audio processor
        audio_processor = AudioProcessor(**tts_config.audio)

        # LOAD TTS MODEL
        # multi speaker
        self.speaker_id = None
        self.speakers = []

        use_phonemes = tts_config.use_phonemes

        if 'characters' in tts_config.keys():
            symbols, phonemes = make_symbols(**tts_config.characters)

        num_chars = len(phonemes if use_phonemes else symbols)

        # build the model, then restore its weights on CPU
        model = setup_model(num_chars, len(self.speakers), tts_config)
        cp = torch.load(tts_checkpoint_path, map_location=torch.device('cpu'))
        model.load_state_dict(cp['model'])

        if use_cuda:
            model.cuda()
        model.eval()

        model.decoder.max_decoder_steps = 3000

        # set model stepsize
        if 'r' in cp:
            model.decoder.set_r(cp['r'])

        # LOAD VOCODER MODEL
        self.vocoder_model = setup_generator(vocoder_config)
        vocoder_state = torch.load(vocoder_checkpoint_path, map_location="cpu")
        self.vocoder_model.load_state_dict(vocoder_state["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

        # TODO: need to train a model?
        wav = self.tts(
            model,
            text,
            tts_config,
            use_cuda,
            audio_processor,
            use_gl=False,
            figures=True,
        )
        print(len(wav.tobytes()))

        # save the generated .wav file as (file_name + "_audio.wav")
        wav_path = file_name + "_audio.wav"
        wavfile.write(wav_path, tts_config.audio["sample_rate"], wav)

        # convert the generated audio file to the specified audio format
        self.convert_audio_to(expected_output_audio_format, file_name)