Example #1
    def test_in_out(self):
        self._create_random_model()
        tts_root_path = get_tests_output_path()
        tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar")
        tts_config = os.path.join(tts_root_path, "dummy_model_config.json")
        synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
        synthesizer.tts("Better this test works!!")
Example #2
def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA',
        vocoder_name=None,
        use_cuda=False):
    """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text.

    Example:
        >>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
        >>> wavs = synthesizer.tts("This is a test! This is also a test!!")
            wavs - is a list of values of the synthesized speech.

    Args:
        model_name (str, optional): One of the model names from .model.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'.
        vocoder_name (str, optional): One of the model names from .model.json. Defaults to 'vocoder_models/en/ljspeech/multiband-melgan'.
        pretrained (bool, optional): [description]. Defaults to True.

    Returns:
        TTS.utils.synthesizer.Synthesizer: Synthesizer object wrapping both vocoder and tts models.
    """
    manager = ModelManager()

    model_path, config_path, model_item = manager.download_model(model_name)
    vocoder_name = model_item[
        'default_vocoder'] if vocoder_name is None else vocoder_name
    vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    # create synthesizer
    synt = Synthesizer(tts_checkpoint=model_path,
                       tts_config_path=config_path,
                       vocoder_checkpoint=vocoder_path,
                       vocoder_config=vocoder_config_path,
                       use_cuda=use_cuda)
    return synt
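
A minimal usage sketch for this Hub entry point (a hypothetical session: it assumes `torch` and the TTS package are installed, and 'hub_output.wav' is an arbitrary output path; `torch.hub.load` and `save_wav` are taken from the docstring above and the other examples here):

import torch

synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
wav = synthesizer.tts("This is a test!")
synthesizer.save_wav(wav, 'hub_output.wav')
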
Example #3
    def test_in_out(self):
        self._create_random_model()
        config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
        tts_root_path = get_tests_output_path()
        config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
        config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
        synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
        synthesizer.tts("Better this test works!!")
Example #4
def tts(text):
    synthesizer = Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda)
    wav = synthesizer.tts(text)
    # save the results
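    # build a filesystem-safe name: first 20 characters, spaces to underscores, punctuation stripped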
    file_name = text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = OUT_FILE
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(wav, out_path)
    playsound(out_path)
Example #5
    def __init__(self):

        manager = ModelManager()
        model_path, config_path, model_item = manager.download_model(MODEL)

        vocoder_path, vocoder_config_path, _ = manager.download_model(
            model_item['default_vocoder'])

        # the last positional argument is use_cuda
        self.synth = Synthesizer(model_path, config_path, vocoder_path,
                                 vocoder_config_path, False)
Example #6
    def __init__(self, auto_start: bool = True):
        super().__init__(auto_start)

        path = Path(
            __file__
        ).parent / "../../.venv/lib/python3.8/site-packages/TTS/.models.json"
        manager = ModelManager(path)

        model_path, config_path = manager.download_model(self.model_name)
        vocoder_path, vocoder_config_path = manager.download_model(
            self.vocoder_name)

        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, self.use_cuda)
Example #7
    def __init__(self):
        path = Path(synthesize.__file__).parent / "../.models.json"
        logger.info(f"Models file path: {path}")
        logger.info("Creating ModelManager")
        self.manager = ModelManager(path)
        logger.info("Downloading model")
        model_path, config_path, _ = self.manager.download_model(
            self.MODEL_NAME)
        logger.info("Downloading vocoder")
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(
            self.VOCODER_NAME)
        logger.info("Finished downloading TTS model & vocoder")
        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, False)
        self.tts_lock = threading.Lock()
Example #8
    def text_to_speech(self):
        tts_config = CONFIG['tts_config']
        models_folder = Path(tts_config['folder'])

        model_path = str(models_folder / tts_config['model'])
        model_config_path = str(models_folder / tts_config['model_config'])
        vocoder_path = str(models_folder / tts_config['vocoder'])
        vocoder_config_path = str(models_folder / tts_config['vocoder_config'])

        self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                       vocoder_path, vocoder_config_path)

        while True:
            response = self.chatbot_to_tts_queue.get()
            print("TTS:", response)

            sound_arr = np.array(self.mozilla_tts.tts(response))

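            # scale float samples in [-1.0, 1.0] up to the 16-bit signed PCM range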
            sound_arr *= 2**15
            sound_arr = sound_arr.astype('int16')

            sound = bytes(sound_arr)
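            # resample the 16-bit mono audio from the TTS output rate to the inbound call rate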
            sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                      self.IN_AUDIO_RATE, None)

            ulaw_sound = audioop.lin2ulaw(sound, 2)

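            # emit the ulaw-encoded audio as fixed-size, base64-encoded chunks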
            chunk_len = 540
            chunks = len(ulaw_sound) // chunk_len
            extra = len(ulaw_sound) - (chunks * chunk_len)

            for c in range(chunks):
                chunk = ulaw_sound[c * chunk_len:c * chunk_len + chunk_len]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            if extra != 0:
                chunk = ulaw_sound[-extra:]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            self.transcript.append({
                "speaker": "self",
                "text": response,
                "datetime": dt.datetime.now().isoformat()
            })
Example #9
def generate():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
    else:
        if not os.path.exists('mozilla-tts-output'):
            try:
                os.makedirs('mozilla-tts-output')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        generatebutton.config(state="disabled")
        exportbutton.config(state="disabled")
        model_path = None
        config_path = None
        vocoder_path = None
        vocoder_config_path = None
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)
        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        # for dev
        #model_path, config_path, model_item = manager.download_model(model_name)
        # for master
        model_path, config_path = manager.download_model(model_name)
        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        # for dev
        #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
        # for master
        vocoder_path, vocoder_config_path = manager.download_model(
            vocoder_name)
        synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                  vocoder_config_path,
                                  cudacheckbutton.instate(['selected']))
        wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
        synthesizer.save_wav(wav, "mozilla-tts-output/generated.wav")
        playsound("mozilla-tts-output/generated.wav")
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
        print("All done!")
Example #10
def exportaudio():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
    else:
        f = filedialog.asksaveasfile(mode='a',
                                     defaultextension=".wav",
                                     filetypes=[("Wave files", ".wav")])
        if f is None:  # asksaveasfile returns `None` if the dialog is closed with "cancel".
            return
        generatebutton.config(state="disabled")
        exportbutton.config(state="disabled")
        model_path = None
        config_path = None
        vocoder_path = None
        vocoder_config_path = None
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)
        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        # for dev
        #model_path, config_path, model_item = manager.download_model(model_name)
        # for master
        model_path, config_path = manager.download_model(model_name)
        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        # for dev
        #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
        # for master
        vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
        synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                  vocoder_config_path,
                                  cudacheckbutton.instate(['selected']))
        wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
        synthesizer.save_wav(wav, str(f.name))
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
        print("All done!")
Example #11
    def _get_synthesizer(self, language) -> Synthesizer:
        if '-' in language:
            language = language.split('-')[0]
        stopwatch = Stopwatch()
        with stopwatch:
            model_name = None

            for model in self.models:
                _, lang, dataset, name = model.split('/')
                print(f"{lang}|{name}")
                if language in lang:
                    model_name = model
                    if name == self.preferred_model:
                        break

            model_path, config_path, model_item = self.manager.download_model(
                model_name)
            vocoder_name = model_item.get(
                "default_vocoder",
                "vocoder_models/universal/libri-tts/fullband-melgan")
            vocoder_path, vocoder_config_path, _ = self.manager.download_model(
                vocoder_name)
            speakers_file_path = ''
            encoder_path = ''
            encoder_config_path = ''
            use_cuda = False

            synthesizer = Synthesizer(
                model_path,
                config_path,
                speakers_file_path,
                vocoder_path,
                vocoder_config_path,
                encoder_path,
                encoder_config_path,
                use_cuda,
            )
        LOG.debug(f"Get synthesizer time={stopwatch.time}")
        return synthesizer
Example #12
def make_synthesizer(model_name, use_cuda):
    # load model manager
    path = Path(TTS.__file__).parent / ".models.json"
    manager = ModelManager(path)

    model_path, config_path, model_item = manager.download_model(model_name)
    vocoder_name = model_item["default_vocoder"]
    vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    speakers_file_path = None
    encoder_path = None
    encoder_config_path = None

    return Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        use_cuda,
    )
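
A hypothetical call to the factory above (the model name matches one used elsewhere in these examples; the output path is arbitrary):

synthesizer = make_synthesizer("tts_models/en/ljspeech/tacotron2-DDC", use_cuda=False)
wav = synthesizer.tts("Hello world.")
synthesizer.save_wav(wav, "hello.wav")
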
Example #13
def text_to_wav(text, lang):
    #class Synthesizer(object):
    #def __init__(self, tts_checkpoint, tts_config, vocoder_checkpoint=None, vocoder_config=None, use_cuda=False):
    tts_checkpoint = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/model_file.pth.tar"
    tts_config = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/config.json"
    #tts_checkpoint = "/home/hector/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar"
    #tts_config = "/home/hector/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json"
    if lang in ["Es"]:
        tts_checkpoint = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/model_file.pth.tar"
        tts_config = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/config.json"

    vocoder_checkpoint = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/model_file.pth.tar"
    vocoder_config = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/config.json"

    synthesizer = Synthesizer(tts_checkpoint, tts_config, vocoder_checkpoint,
                              vocoder_config)

    # kick it
    wav = synthesizer.tts(text)

    # save the results
    file_name = 'audio.wav'
    #print(" > Saving output to {}".format(file_name))
    synthesizer.save_wav(wav, file_name)
Example #14
def initsynthesizer(model_name, vocoder_name, use_cuda):
    # assumes a module-level ModelManager instance named `manager`
    model_path, config_path = manager.download_model(model_name)
    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
    return Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, use_cuda)
Example #15
def main():
    # pylint: disable=bad-continuation
    parser = argparse.ArgumentParser(
        description='''Synthesize speech on command line.\n\n'''
        '''You can either use your trained model or choose a model from the provided list.\n\n'''
        '''If you don't specify any models, then it uses the LJSpeech based English model.\n\n'''
        '''
    Example runs:

    # list provided models
    ./TTS/bin/synthesize.py --list_models

    # run tts with default models.
    ./TTS/bin/synthesize.py --text "Text for TTS"

    # run a tts model with its default vocoder model.
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"

    # run with specific tts and vocoder models from the list
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav

    # run your own TTS model (Using Griffin-Lim Vocoder)
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

    # run your own TTS and Vocoder models
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

    ''',
        formatter_class=RawTextHelpFormatter)

    parser.add_argument(
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.')
    parser.add_argument('--text',
                        type=str,
                        default=None,
                        help='Text to generate speech.')

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default="tts_models/en/ljspeech/speedy-speech-wn",
        help=
        'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument('--config_path',
                        default=None,
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help=
        'Path to save the final wav file. The wav file will be named after the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, the model uses GL as vocoder. Please make sure that you have installed the vocoder library beforehand (WaveRNN).',
        default=None,
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help=
        "if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument('--gst_style',
                        help="Wav path file for GST stylereference.",
                        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False)

    args = parser.parse_args()

    # print the description if either text or list_models is not set
    if args.text is None and not args.list_models:
        parser.parse_args(['-h'])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained models
    if args.model_name is not None:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            'default_vocoder'] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path, args.use_cuda)

    print(" > Text: {}".format(args.text))

    # # handle multi-speaker setting
    # if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
    #     if args.speaker_idx.isdigit():
    #         args.speaker_idx = int(args.speaker_idx)
    #     else:
    #         args.speaker_idx = None
    # else:
    #     args.speaker_idx = None

    # if args.gst_style is None:
    #     if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
    #         gst_style = model_config.gst['gst_style_input']
    #     else:
    #         gst_style = None
    # else:
    #     # check if gst_style string is a dict, if is dict convert  else use string
    #     try:
    #         gst_style = json.loads(args.gst_style)
    #         if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
    #             raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
    #     except ValueError:
    #         gst_style = args.gst_style

    # kick it
    wav = synthesizer.tts(args.text)

    # save the results
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(
        wav,
        out_path,
    )
Example #16
def load_synthesizer():
    MODEL_PATH = './voice/models/glow-tts/best_model.pth.tar'
    CONFIG_PATH = './voice/models/glow-tts/config.json'
    use_cuda = False
    synthesizer = Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda)
    return synthesizer
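
Note that `Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda)` passes `use_cuda` positionally; in TTS versions where the vocoder arguments come before `use_cuda` (see Example #2 above), the third positional slot is the vocoder checkpoint. A defensive variant, assuming such a signature:

synthesizer = Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda=use_cuda)  # keyword form cannot bind to a vocoder slot
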
Example #17
if args.vocoder_name is not None:
    vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(args.vocoder_name)

# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
    args.tts_checkpoint = tts_checkpoint_file
if not args.tts_config and os.path.isfile(tts_config_file):
    args.tts_config = tts_config_file

if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
    args.vocoder_checkpoint = vocoder_checkpoint_file
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
    args.vocoder_config = vocoder_config_file

synthesizer = Synthesizer(
    args.tts_checkpoint, args.tts_config, args.vocoder_checkpoint, args.vocoder_config, args.use_cuda
)

app = Flask(__name__)


@app.route("/")
def index():
    return render_template("index.html", show_details=args.show_details)


@app.route("/details")
def details():
    model_config = load_config(args.tts_config)
    if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
        vocoder_config = load_config(args.vocoder_config)
Example #18
def main():
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Synthesize speech on command line.\n\n"""
        """You can either use your trained model or choose a model from the provided list.\n\n"""
        """If you don't specify any models, then it uses LJSpeech based English model.\n\n"""
        """
    # Example Runs:

    ## Single Speaker Models

    - list provided models

    ```
    $ ./TTS/bin/synthesize.py --list_models
    ```

    - run tts with default models.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS"
    ```

    - run a tts model with its default vocoder model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

    - run with specific tts and vocoder models from the list

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
    ```

    - run your own TTS model (Using Griffin-Lim Vocoder)

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    ```

    - run your own TTS and Vocoder models
    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
    ```

    ## MULTI-SPEAKER MODELS

    - list the available speakers and choose as <speaker_id> among them.

    ```
    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

    - run the multi-speaker TTS model with the target speaker ID.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

    - run your own multi-speaker TTS model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained tts and vocoder models.",
    )
    parser.add_argument("--text",
                        type=str,
                        default=None,
                        help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help=
        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help=
        "Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda",
                        type=bool,
                        help="Run model on CUDA.",
                        default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help=
        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path",
                        type=str,
                        help="Path to vocoder model config file.",
                        default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path",
                        type=str,
                        help="Path to speaker encoder config file.",
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path",
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help=
        "wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style",
                        help="Wav path file for GST stylereference.",
                        default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )

    args = parser.parse_args()

    # print the description if either text or list_models is not set
    if args.text is None and not args.list_models and not args.list_speaker_idxs:
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            "default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.speaker_manager.speaker_ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx
                                          and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav)

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
Example #19
def main():
    description = """Synthesize speech on command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, then it uses the LJSpeech based English model.

## Example Runs

### Single Speaker Models

- List provided models:

    ```
    $ tts --list_models
    ```

- Query model info by idx:

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

- Query model info by full name:

    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with default models:

    ```
    $ tts --text "Text for TTS"
    ```

- Run a TTS model with its default vocoder model:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
    ```

- Run your own TTS model (Using Griffin-Lim Vocoder):

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
    ```

- Run your own TTS and Vocoder models:
    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
    ```

### Multi-speaker Models

- List the available speakers and choose as <speaker_id> among them:

    ```
    $ tts --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

- Run the multi-speaker TTS model with the target speaker ID:

    ```
    $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

- Run your own multi-speaker TTS model:

    ```
    $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """
    # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
    # documentation in sync more easily.
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    args = parser.parse_args()

    # print the description if either text or list_models is not set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2 #info : model info of pre-trained TTS models
    if args.model_info_by_idx:
        model_query = args.model_info_by_idx
        manager.model_info_by_idx(model_query)
        sys.exit()

    if args.model_info_by_name:
        model_query_full_name = args.model_info_by_name
        manager.model_info_by_full_name(model_query_full_name)
        sys.exit()

    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.tts_model.speaker_manager.ids)
        return

    # query language ids of a multi-lingual model.
    if args.list_language_idxs:
        print(
            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
        )
        print(synthesizer.tts_model.language_manager.ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    if args.text:
        print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(
        args.text,
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
    )

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
Example #20
if args.model_path is not None:
    model_path = args.model_path
    config_path = args.config_path
    speakers_file_path = args.speakers_file_path

if args.vocoder_path is not None:
    vocoder_path = args.vocoder_path
    vocoder_config_path = args.vocoder_config_path

# load models
synthesizer = Synthesizer(
    tts_checkpoint=model_path,
    tts_config_path=config_path,
    tts_speakers_file=speakers_file_path,
    tts_languages_file=None,
    vocoder_checkpoint=vocoder_path,
    vocoder_config=vocoder_config_path,
    encoder_checkpoint="",
    encoder_config="",
    use_cuda=args.use_cuda,
)

use_multi_speaker = hasattr(
    synthesizer.tts_model,
    "num_speakers") and (synthesizer.tts_model.num_speakers > 1
                         or synthesizer.tts_speakers_file is not None)

speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)
Example #21
if args.vocoder_name is not None and not args.vocoder_path:
    vocoder_path, vocoder_config_path, _ = manager.download_model(
        args.vocoder_name)

# CASE3: set custom model paths
if args.model_path is not None:
    model_path = args.model_path
    config_path = args.config_path
    speakers_file_path = args.speakers_file_path

if args.vocoder_path is not None:
    vocoder_path = args.vocoder_path
    vocoder_config_path = args.vocoder_config_path

# load models
synthesizer = Synthesizer(model_path,
                          config_path,
                          speakers_file_path,
                          vocoder_path,
                          vocoder_config_path,
                          use_cuda=args.use_cuda)

use_multi_speaker = synthesizer.speaker_manager is not None
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)


def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
    """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer)
    or a dict (gst tokens/values to be use for styling)

    Args:
        style_wav (str): uri
Example #22
            print("Link: " + v.text)
            links.append([v.text.strip(), v.attrs['href']])
    return (txt, links)


ar = len(sys.argv)
if (ar == 1):
    print("Please use a URL as the first argument")
    quit()

url = sys.argv[1]
#txt = cleantext(txt)
#txt2 = summ(txt)
#print(txt2)
hashindex = HashIndex(url)
print(hashindex.index)

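# these paths point at models previously fetched by TTS's ModelManager, which typically caches under ~/.local/share/tts/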
path = '/home/osiris/.local/share/tts/'

model_path = path + 'tts_models--en--ljspeech--tacotron2-DCA/model_file.pth.tar'
config_path = path + 'tts_models--en--ljspeech--tacotron2-DCA/config.json'
vocoder_path = path + 'vocoder_models--en--ljspeech--multiband-melgan/model_file.pth.tar'
vocoder_config_path = path + 'vocoder_models--en--ljspeech--multiband-melgan/config.json'

synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                          vocoder_config_path, False)
#playall(txt2)

(txt, links) = getPageData(url)
go(txt)