def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', vocoder_name=None, use_cuda=False):
    """PyTorch Hub entry point that returns a Synthesizer for turning text into speech.

    Example:
        >>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
        >>> wavs = synthesizer.tts("This is a test! This is also a test!!")
        wavs - is a list of values of the synthesized speech.

    Args:
        model_name (str, optional): Name of the TTS model to download.
            Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'.
        vocoder_name (str, optional): Name of the vocoder model to download.
            When None, the ``default_vocoder`` entry of the downloaded TTS
            model item is used instead.
        use_cuda (bool, optional): Forwarded unchanged to the Synthesizer.
            Defaults to False.

    Returns:
        Synthesizer: object wrapping both the vocoder and the TTS model.
    """
    model_manager = ModelManager()
    tts_checkpoint, tts_config, tts_item = model_manager.download_model(model_name)

    # Fall back to the vocoder recommended by the TTS model itself.
    if vocoder_name is None:
        vocoder_name = tts_item['default_vocoder']
    vocoder_checkpoint, vocoder_config, _ = model_manager.download_model(vocoder_name)

    return Synthesizer(
        tts_checkpoint=tts_checkpoint,
        tts_config_path=tts_config,
        vocoder_checkpoint=vocoder_checkpoint,
        vocoder_config=vocoder_config,
        use_cuda=use_cuda,
    )
def download_tts_model_and_vocodec():
    """Download the TTS model and the vocoder named on the ``Tts`` class.

    The model catalogue is read from the ``.models.json`` file located one
    directory above the ``synthesize`` module.
    """
    models_file = Path(synthesize.__file__).parent / "../.models.json"
    model_manager = ModelManager(models_file)

    logger.info("Downloading model")
    model_manager.download_model(Tts.MODEL_NAME)

    logger.info("Downloading vcoder")
    model_manager.download_model(Tts.VOCODER_NAME)

    logger.info("Finished downloading TTS model & vocoder")
def __init__(self):
    """Download ``MODEL`` and its default vocoder, then build ``self.synth``."""
    model_manager = ModelManager()
    tts_checkpoint, tts_config, tts_item = model_manager.download_model(MODEL)
    default_vocoder = tts_item['default_vocoder']
    vocoder_checkpoint, vocoder_config, _ = model_manager.download_model(default_vocoder)

    # The last positional argument is the use-CUDA switch; it is kept off.
    self.synth = Synthesizer(
        tts_checkpoint,
        tts_config,
        vocoder_checkpoint,
        vocoder_config,
        False,
    )
def __init__(self, lang="en-us", config=None):
    """Set up the local Mozilla TTS plugin.

    Args:
        lang: Language code handed to the base class and used to pick the
            initial synthesizer.
        config: Plugin configuration. When falsy, the ``mozilla_local``
            section of the Neon TTS configuration is used (empty dict if
            that section is missing).
    """
    if not config:
        config = get_neon_tts_config().get("mozilla_local", {})
    super().__init__(
        lang,
        config,
        MozillaTTSValidator(self),
        audio_ext="mp3",
        ssml_tags=["speak"],
    )

    self.manager = ModelManager()
    self.models = self.manager.list_models()
    self.preferred_model = config.get("preferred_model", "tacotron2-DDC")

    # Make sure we have a model available in init
    self._get_synthesizer(lang)
def test_run_all_models():
    """Check if all the models are downloadable and tts models run correctly."""
    print(" > Run synthesizer with all the models.")
    download_dir = get_user_data_dir("tts")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()

    for model_name in model_names:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)

        if "tts_models" not in model_name:
            # Non-TTS models (e.g. vocoders) are only downloaded, never run.
            manager.download_model(model_name)
            print(f" | > OK: {model_name}")
            continue

        # Look for the speaker / language files shipped next to the checkpoint.
        model_dir = os.path.dirname(model_path)
        speaker_files = glob.glob(model_dir + "/speaker*")
        language_files = glob.glob(model_dir + "/language*")
        language_id = ""
        base_command = (
            f"tts --model_name {model_name} "
            f'--text "This is an example." --out_path "{output_path}"'
        )

        if speaker_files:
            # Multi-speaker model: the file name decides how the manager is built.
            first_speaker_file = speaker_files[0]
            if "speaker_ids" in first_speaker_file:
                speaker_manager = SpeakerManager(speaker_id_file_path=first_speaker_file)
            elif "speakers" in first_speaker_file:
                speaker_manager = SpeakerManager(d_vectors_file_path=first_speaker_file)

            # Multi-lingual models are assumed to be multi-speaker as well.
            if language_files and "language_ids" in language_files[0]:
                language_manager = LanguageManager(language_ids_file_path=language_files[0])
                language_id = language_manager.language_names[0]

            speaker_id = list(speaker_manager.ids.keys())[0]
            run_cli(base_command + f' --speaker_idx "{speaker_id}" --language_idx "{language_id}" ')
        else:
            # Single-speaker model.
            run_cli(base_command)

        # Remove downloaded models.
        shutil.rmtree(download_dir)
        print(f" | > OK: {model_name}")

    # One folder per model is expected under the manager's output prefix.
    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
def test_if_all_models_available():
    """Check if all the models are downloadable."""
    print(" > Checking the availability of all the models under the ModelManager.")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()

    for name in model_names:
        manager.download_model(name)
        print(f" | > OK: {name}")

    # One folder per model is expected under the manager's output prefix.
    downloaded_folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    expected_count = len(model_names)
    assert len(downloaded_folders) == expected_count
    shutil.rmtree(manager.output_prefix)
def __init__(self, auto_start: bool = True):
    """Initialise the base class, download the models and build ``self.synthesizer``.

    Args:
        auto_start: Forwarded unchanged to the base-class initialiser.
    """
    super().__init__(auto_start)

    # The catalogue is read from the TTS package inside the project's
    # Python 3.8 virtual environment, addressed relative to this file.
    module_dir = Path(__file__).parent
    models_file = module_dir / "../../.venv/lib/python3.8/site-packages/TTS/.models.json"
    model_manager = ModelManager(models_file)

    # download_model is unpacked as a (checkpoint path, config path) pair here.
    tts_checkpoint, tts_config = model_manager.download_model(self.model_name)
    vocoder_checkpoint, vocoder_config = model_manager.download_model(self.vocoder_name)

    self.synthesizer = Synthesizer(
        tts_checkpoint,
        tts_config,
        vocoder_checkpoint,
        vocoder_config,
        self.use_cuda,
    )
def __init__(self):
    """Download the configured model and vocoder, then build the synthesizer.

    Sets ``self.manager``, ``self.synthesizer`` and ``self.tts_lock``.
    """
    models_file = Path(synthesize.__file__).parent / "../.models.json"
    logger.info("path")

    logger.info("Creating ModelManager")
    self.manager = ModelManager(models_file)

    logger.info("Downloading model")
    tts_checkpoint, tts_config, _ = self.manager.download_model(self.MODEL_NAME)

    logger.info("Downloading vcoder")
    vocoder_checkpoint, vocoder_config, _ = self.manager.download_model(self.VOCODER_NAME)

    logger.info("Finished downloading TTS model & vcoder")

    # NOTE(review): the trailing False is presumably the use-CUDA switch of
    # the five-argument Synthesizer signature - confirm against the library.
    self.synthesizer = Synthesizer(
        tts_checkpoint,
        tts_config,
        vocoder_checkpoint,
        vocoder_config,
        False,
    )
    self.tts_lock = threading.Lock()
def generate():
    """Synthesize the text in the input box, save it as a wav file and play it.

    Shows an error dialog and does nothing else when the text box is empty.
    Otherwise the selected TTS and vocoder models are downloaded, the text is
    synthesized into ``mozilla-tts-output/generated.wav`` and the file is
    played. Both buttons are disabled while this runs and are always
    re-enabled afterwards, even if downloading or synthesis raises.
    """
    text = inputbox.get("1.0", "end-1c")
    if text == "":
        messagebox.showerror(
            message="TTS will give a division by zero error if the text field is blank."
        )
        return

    # exist_ok makes the call safe when the directory is already there.
    os.makedirs('mozilla-tts-output', exist_ok=True)

    generatebutton.config(state="disabled")
    exportbutton.config(state="disabled")
    try:
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)

        # This ModelManager version returns (checkpoint path, config path).
        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        model_path, config_path = manager.download_model(model_name)

        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)

        synthesizer = Synthesizer(
            model_path,
            config_path,
            vocoder_path,
            vocoder_config_path,
            cudacheckbutton.instate(['selected']),
        )
        wav = synthesizer.tts(text)
        synthesizer.save_wav(wav, "mozilla-tts-output/generated.wav")
        playsound("mozilla-tts-output/generated.wav")
    finally:
        # Without this the UI would stay locked after any failure above.
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
    print("All done!")
def exportaudio():
    """Synthesize the text in the input box and save it to a user-chosen wav file.

    Shows an error dialog and does nothing else when the text box is empty,
    and returns silently when the save dialog is cancelled. Both buttons are
    disabled while this runs and are always re-enabled afterwards, even if
    downloading or synthesis raises.
    """
    text = inputbox.get("1.0", "end-1c")
    if text == "":
        messagebox.showerror(
            message="TTS will give a division by zero error if the text field is blank."
        )
        return

    f = filedialog.asksaveasfile(
        mode='a',
        defaultextension=".wav",
        filetypes=[("Wave files", ".wav")],
    )
    if f is None:
        # asksaveasfile returns `None` if the dialog is closed with "cancel".
        return
    # Only the chosen path is needed: save_wav writes to it by name, so the
    # handle opened by the dialog is closed right away instead of leaking.
    out_path = str(f.name)
    f.close()

    generatebutton.config(state="disabled")
    exportbutton.config(state="disabled")
    try:
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)

        # This ModelManager version returns (checkpoint path, config path).
        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        model_path, config_path = manager.download_model(model_name)

        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)

        synthesizer = Synthesizer(
            model_path,
            config_path,
            vocoder_path,
            vocoder_config_path,
            cudacheckbutton.instate(['selected']),
        )
        wav = synthesizer.tts(text)
        synthesizer.save_wav(wav, out_path)
    finally:
        # Without this the UI would stay locked after any failure above.
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
    print("All done!")
def make_synthesizer(model_name, use_cuda):
    """Build a Synthesizer for ``model_name`` paired with its default vocoder.

    Args:
        model_name: Name of the TTS model to download through the ModelManager.
        use_cuda: Passed through as the Synthesizer's last positional argument.

    Returns:
        The constructed Synthesizer. No speakers file and no speaker encoder
        are supplied (those positional slots are None).
    """
    # The model catalogue ships inside the installed TTS package.
    models_file = Path(TTS.__file__).parent / ".models.json"
    model_manager = ModelManager(models_file)

    tts_checkpoint, tts_config, tts_item = model_manager.download_model(model_name)
    vocoder_checkpoint, vocoder_config, _ = model_manager.download_model(
        tts_item["default_vocoder"]
    )

    return Synthesizer(
        tts_checkpoint,
        tts_config,
        None,  # speakers_file_path
        vocoder_checkpoint,
        vocoder_config,
        None,  # encoder_path
        None,  # encoder_config_path
        use_cuda,
    )
parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.") parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.") return parser # parse the args args = create_argparser().parse_args() path = Path(__file__).parent / "../.models.json" manager = ModelManager(path) if args.list_models: manager.list_models() sys.exit() # update in-use models to the specified released models. model_path = None config_path = None speakers_file_path = None vocoder_path = None vocoder_config_path = None # CASE1: list pre-trained TTS models if args.list_models: manager.list_models()
def main():
    """Command-line entry point: parse arguments, resolve model files and synthesize speech.

    Depending on the flags this lists the released models, prints model
    info, lists the speaker / language ids of the loaded model, or runs the
    synthesis and writes the result to ``--out_path``. Pre-trained models
    are downloaded through the ModelManager unless custom ``--model_path`` /
    ``--vocoder_path`` files are supplied.
    """
    description = """Synthesize speech on command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, then it uses LJSpeech based English model.

## Example Runs

### Single Speaker Models

- List provided models:

    ```
    $ tts --list_models
    ```

- Query info for model info by idx:

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

- Query info for model info by full name:

    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with default models:

    ```
    $ tts --text "Text for TTS"
    ```

- Run a TTS model with its default vocoder model:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
    ```

- Run your own TTS model (Using Griffin-Lim Vocoder):

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
    ```

- Run your own TTS and Vocoder models:

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
    ```

### Multi-speaker Models

- List the available speakers and choose as <speaker_id> among them:

    ```
    $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
    ```

- Run the multi-speaker TTS model with the target speaker ID:

    ```
    $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
    ```

- Run your own multi-speaker TTS model:

    ```
    $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
"""
    # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
    # documentation in sync more easily.
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )
    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )
    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )
    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    # `type=bool` would turn every non-empty string (including "false") into
    # True, so boolean flags go through str2bool like --list_models does.
    parser.add_argument(
        "--use_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Run model on CUDA.",
    )
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )

    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )

    args = parser.parse_args()

    # Print the help text (and exit) when none of the action-selecting
    # arguments was given.
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2 #info : model info of pre-trained TTS models
    if args.model_info_by_idx:
        manager.model_info_by_idx(args.model_info_by_idx)
        sys.exit()

    if args.model_info_by_name:
        manager.model_info_by_full_name(args.model_info_by_name)
        sys.exit()

    # CASE3: load pre-trained model paths (skipped when custom files are given)
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        if args.vocoder_name is None:
            args.vocoder_name = model_item["default_vocoder"]

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.tts_model.speaker_manager.ids)
        return

    # query language ids of a multi-lingual model.
    if args.list_language_idxs:
        print(
            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
        )
        print(synthesizer.tts_model.language_manager.ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    # Only the echo is conditional: --reference_wav alone also passes the
    # argument check above, so synthesis still runs when no text was given.
    if args.text:
        print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(
        args.text,
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
    )

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, print_eval=False, mixed_precision=False, sort_by_audio_len=True, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], ) # compute alignments if not config.model_args.use_aligner: manager = ModelManager() model_path, config_path, _ = manager.download_model( "tts_models/en/ljspeech/tacotron2-DCA") # TODO: make compute_attention python callable os.system( f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" ) # train the model args, config, output_path, _, c_logger, tb_logger = init_training( TrainingArgs(), config) trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer.fit()
def main():
    """Command-line entry point: parse arguments, resolve model files and synthesize speech.

    With ``--list_models`` the released models are listed and the program
    exits. Otherwise the text is synthesized and saved under ``--out_path``
    in a wav file named after the first 20 characters of the text (spaces
    replaced by underscores, other punctuation removed). Pre-trained models
    are downloaded through the ModelManager unless custom ``--model_path`` /
    ``--vocoder_path`` files are supplied.
    """
    # pylint: disable=bad-continuation
    parser = argparse.ArgumentParser(
        description='''Synthesize speech on command line.\n\n'''
        '''You can either use your trained model or choose a model from the provided list.\n\n'''
        '''If you don't specify any models, then it uses LJSpeech based English models\n\n'''
        '''
    Example runs:

    # list provided models
    ./TTS/bin/synthesize.py --list_models

    # run tts with default models.
    ./TTS/bin/synthesize.py --text "Text for TTS"

    # run a tts model with its default vocoder model.
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"

    # run with specific tts and vocoder models from the list
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path

    # run your own TTS model (Using Griffin-Lim Vocoder)
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path

    # run your own TTS and Vocoder models
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

''',
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.',
    )
    parser.add_argument('--text', type=str, default=None, help='Text to generate speech.')

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default="tts_models/en/ljspeech/speedy-speech-wn",
        help='Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>',
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help='Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>',
    )

    # Args for running custom models
    parser.add_argument('--config_path', default=None, type=str, help='Path to model config file.')
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help='Path to save final wav file. Wav file will be named as the given text.',
    )
    # `type=bool` would turn every non-empty string (including "false") into
    # True, so boolean flags go through str2bool like --list_models does.
    parser.add_argument(
        '--use_cuda',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='Run model on CUDA.',
    )
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help='Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
        default=None,
    )
    parser.add_argument('--vocoder_config_path', type=str, help='Path to vocoder model config file.', default=None)

    # args for multi-speaker synthesis
    parser.add_argument('--speakers_json', type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None,
    )
    parser.add_argument('--gst_style', help="Wav path file for GST stylereference.", default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
    )

    args = parser.parse_args()

    # print the description (and exit) if neither text nor list_models is set
    if args.text is None and not args.list_models:
        parser.parse_args(['-h'])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained models.
    # --model_name always has a default, so without the path guards the
    # default model (and its vocoder) would be downloaded, and the default
    # vocoder applied, even when the user supplies custom files.
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        if args.vocoder_name is None:
            args.vocoder_name = model_item['default_vocoder']

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, args.use_cuda)

    print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(args.text)

    # save the results: the file name is the first 20 characters of the text
    # with spaces turned into underscores and all other punctuation removed.
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(
        wav,
        out_path,
    )
def main():
    """Command-line entry point: parse arguments, resolve model files and synthesize speech.

    Depending on the flags this lists the released models, lists the speaker
    ids of the loaded model, or synthesizes ``--text`` and writes the result
    to ``--out_path``. Pre-trained models are downloaded through the
    ModelManager unless custom ``--model_path`` / ``--vocoder_path`` files
    are supplied.
    """
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Synthesize speech on command line.\n\n"""
        """You can either use your trained model or choose a model from the provided list.\n\n"""
        """If you don't specify any models, then it uses LJSpeech based English model.\n\n"""
        """
    # Example Runs:

    ## Single Speaker Models

    - list provided models

    ```
    $ ./TTS/bin/synthesize.py --list_models
    ```

    - run tts with default models.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS"
    ```

    - run a tts model with its default vocoder model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

    - run with specific tts and vocoder models from the list

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
    ```

    - run your own TTS model (Using Griffin-Lim Vocoder)

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    ```

    - run your own TTS and Vocoder models

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
    ```

    ## MULTI-SPEAKER MODELS

    - list the available speakers and choose as <speaker_id> among them.

    ```
    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
    ```

    - run the multi-speaker TTS model with the target speaker ID.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
    ```

    - run your own multi-speaker TTS model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained tts and vocoder models.",
    )
    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    # `type=bool` would turn every non-empty string (including "false") into
    # True, so boolean flags go through str2bool like --list_models does.
    parser.add_argument(
        "--use_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Run model on CUDA.",
    )
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )

    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
    )

    args = parser.parse_args()

    # print the description (and exit) if none of text, list_models or
    # list_speaker_idxs is set
    if args.text is None and not args.list_models and not args.list_speaker_idxs:
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained model paths (skipped when custom files are given)
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        if args.vocoder_name is None:
            args.vocoder_name = model_item["default_vocoder"]

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE3: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.speaker_manager.speaker_ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav)

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)